github.com/MetalBlockchain/metalgo@v1.11.9/x/sync/manager.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package sync
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"errors"
    10  	"fmt"
    11  	"slices"
    12  	"sync"
    13  
    14  	"go.uber.org/zap"
    15  	"golang.org/x/exp/maps"
    16  
    17  	"github.com/MetalBlockchain/metalgo/ids"
    18  	"github.com/MetalBlockchain/metalgo/utils/logging"
    19  	"github.com/MetalBlockchain/metalgo/utils/maybe"
    20  	"github.com/MetalBlockchain/metalgo/utils/set"
    21  	"github.com/MetalBlockchain/metalgo/x/merkledb"
    22  
    23  	pb "github.com/MetalBlockchain/metalgo/proto/pb/sync"
    24  )
    25  
// Per-request limits placed on proof requests sent to peers.
const (
	defaultRequestKeyLimit      = maxKeyValuesLimit
	defaultRequestByteSizeLimit = maxByteSizeLimit
)
    30  
// Errors returned by the Manager's public API.
var (
	ErrAlreadyStarted             = errors.New("cannot start a Manager that has already been started")
	ErrAlreadyClosed              = errors.New("Manager is closed")
	ErrNoClientProvided           = errors.New("client is a required field of the sync config")
	ErrNoDatabaseProvided         = errors.New("sync database is a required field of the sync config")
	ErrNoLogProvided              = errors.New("log is a required field of the sync config")
	ErrZeroWorkLimit              = errors.New("simultaneous work limit must be greater than 0")
	ErrFinishedWithUnexpectedRoot = errors.New("finished syncing with an unexpected root")
)
    40  
// priority orders work items within the work heaps.
type priority byte

// Note that [highPriority] > [medPriority] > [lowPriority].
const (
	// iota + 1 so that the zero value is not a valid priority.
	lowPriority priority = iota + 1
	medPriority
	highPriority
)
    49  
// Signifies that we should sync the range [start, end].
// nil [start] means there is no lower bound.
// nil [end] means there is no upper bound.
// [localRootID] is the ID of the root of this range in our database.
// If we have no local root for this range, [localRootID] is ids.Empty.
type workItem struct {
	start       maybe.Maybe[[]byte] // lower bound of the range; Nothing means unbounded
	end         maybe.Maybe[[]byte] // upper bound of the range; Nothing means unbounded
	priority    priority            // scheduling priority within the work heaps
	localRootID ids.ID              // local root for this range, or ids.Empty if none
}
    61  
    62  func newWorkItem(localRootID ids.ID, start maybe.Maybe[[]byte], end maybe.Maybe[[]byte], priority priority) *workItem {
    63  	return &workItem{
    64  		localRootID: localRootID,
    65  		start:       start,
    66  		end:         end,
    67  		priority:    priority,
    68  	}
    69  }
    70  
// Manager coordinates syncing the local database to a target merkle root by
// fetching and applying range/change proofs for key ranges (work items).
type Manager struct {
	// Must be held when accessing [config.TargetRoot].
	syncTargetLock sync.RWMutex
	config         ManagerConfig

	workLock sync.Mutex
	// The number of work items currently being processed.
	// Namely, the number of goroutines executing [doWork].
	// [workLock] must be held when accessing [processingWorkItems].
	processingWorkItems int
	// [workLock] must be held while accessing [unprocessedWork].
	unprocessedWork *workHeap
	// Signalled when:
	// - An item is added to [unprocessedWork].
	// - An item is added to [processedWork].
	// - Close() is called.
	// [workLock] is its inner lock.
	unprocessedWorkCond sync.Cond
	// [workLock] must be held while accessing [processedWork].
	processedWork *workHeap

	// When this is closed:
	// - [cancelCtx] was called (if Start had been called).
	// - [unprocessedWork] and [processedWork] are closed.
	// See [close].
	doneChan chan struct{}

	errLock sync.Mutex
	// If non-nil, there was a fatal error.
	// [errLock] must be held when accessing [fatalError].
	fatalError error

	// Cancels all currently processing work items.
	cancelCtx context.CancelFunc

	// Set to true when Start is called.
	syncing bool
	// Ensures [close] runs at most once.
	closeOnce sync.Once
	// Token size derived from [config.BranchFactor] via
	// merkledb.BranchFactorToTokenSize.
	tokenSize int
}
   111  
// ManagerConfig contains the dependencies and parameters for a Manager.
type ManagerConfig struct {
	// DB is the local database being synced. Required.
	DB DB
	// Client fetches range/change proofs from peers. Required.
	Client Client
	// SimultaneousWorkLimit caps concurrently processed work items. Must be > 0.
	SimultaneousWorkLimit int
	// Log is required.
	Log logging.Logger
	// TargetRoot is the root to sync towards. Once owned by a Manager it is
	// guarded by [Manager.syncTargetLock].
	TargetRoot ids.ID
	// BranchFactor of the trie; must pass Valid().
	BranchFactor merkledb.BranchFactor
}
   120  
   121  func NewManager(config ManagerConfig) (*Manager, error) {
   122  	switch {
   123  	case config.Client == nil:
   124  		return nil, ErrNoClientProvided
   125  	case config.DB == nil:
   126  		return nil, ErrNoDatabaseProvided
   127  	case config.Log == nil:
   128  		return nil, ErrNoLogProvided
   129  	case config.SimultaneousWorkLimit == 0:
   130  		return nil, ErrZeroWorkLimit
   131  	}
   132  	if err := config.BranchFactor.Valid(); err != nil {
   133  		return nil, err
   134  	}
   135  
   136  	m := &Manager{
   137  		config:          config,
   138  		doneChan:        make(chan struct{}),
   139  		unprocessedWork: newWorkHeap(),
   140  		processedWork:   newWorkHeap(),
   141  		tokenSize:       merkledb.BranchFactorToTokenSize[config.BranchFactor],
   142  	}
   143  	m.unprocessedWorkCond.L = &m.workLock
   144  
   145  	return m, nil
   146  }
   147  
   148  func (m *Manager) Start(ctx context.Context) error {
   149  	m.workLock.Lock()
   150  	defer m.workLock.Unlock()
   151  
   152  	if m.syncing {
   153  		return ErrAlreadyStarted
   154  	}
   155  
   156  	m.config.Log.Info("starting sync", zap.Stringer("target root", m.config.TargetRoot))
   157  
   158  	// Add work item to fetch the entire key range.
   159  	// Note that this will be the first work item to be processed.
   160  	m.unprocessedWork.Insert(newWorkItem(ids.Empty, maybe.Nothing[[]byte](), maybe.Nothing[[]byte](), lowPriority))
   161  
   162  	m.syncing = true
   163  	ctx, m.cancelCtx = context.WithCancel(ctx)
   164  
   165  	go m.sync(ctx)
   166  	return nil
   167  }
   168  
// sync awaits signal on [m.unprocessedWorkCond], which indicates that there
// is work to do or syncing completes.  If there is work, sync will dispatch a goroutine to do
// the work.
func (m *Manager) sync(ctx context.Context) {
	defer func() {
		// Invariant: [m.workLock] is held when this deferred function runs,
		// since the loop below only returns while holding it.
		m.close()
		m.workLock.Unlock()
	}()

	// Keep doing work until we're closed, done or [ctx] is canceled.
	m.workLock.Lock()
	for {
		// Invariant: [m.workLock] is held here.
		switch {
		case ctx.Err() != nil:
			return // [m.workLock] released by defer.
		case m.processingWorkItems >= m.config.SimultaneousWorkLimit:
			// We're already processing the maximum number of work items.
			// Wait until one of them finishes.
			m.unprocessedWorkCond.Wait()
		case m.unprocessedWork.Len() == 0:
			if m.processingWorkItems == 0 {
				// There's no work to do, and there are no work items being processed
				// which could cause work to be added, so we're done.
				return // [m.workLock] released by defer.
			}
			// There's no work to do.
			// Note that if [m].Close() is called, or [ctx] is canceled,
			// Close() will be called, which signals [m.unprocessedWorkCond],
			// which will cause Wait() to return, and this goroutine to exit.
			m.unprocessedWorkCond.Wait()
		default:
			// There is work and we have capacity: claim a slot and dispatch
			// a worker. doWork decrements [processingWorkItems] when done.
			m.processingWorkItems++
			work := m.unprocessedWork.GetWork()
			go m.doWork(ctx, work)
		}
	}
}
   208  
   209  // Close will stop the syncing process
   210  func (m *Manager) Close() {
   211  	m.workLock.Lock()
   212  	defer m.workLock.Unlock()
   213  
   214  	m.close()
   215  }
   216  
// close is called when there is a fatal error or sync is complete.
// [workLock] must be held.
func (m *Manager) close() {
	// closeOnce makes shutdown idempotent: close can be reached via Close(),
	// via a fatal error (setError), or via normal completion of sync().
	m.closeOnce.Do(func() {
		// Don't process any more work items.
		// Drop currently processing work items.
		// [cancelCtx] is nil if Start was never called.
		if m.cancelCtx != nil {
			m.cancelCtx()
		}

		// ensure any goroutines waiting for work from the heaps gets released
		m.unprocessedWork.Close()
		m.unprocessedWorkCond.Signal()
		m.processedWork.Close()

		// signal all code waiting on the sync to complete
		close(m.doneChan)
	})
}
   236  
   237  // Processes [item] by fetching and applying a change or range proof.
   238  // Assumes [m.workLock] is not held.
   239  func (m *Manager) doWork(ctx context.Context, work *workItem) {
   240  	defer func() {
   241  		m.workLock.Lock()
   242  		defer m.workLock.Unlock()
   243  
   244  		m.processingWorkItems--
   245  		m.unprocessedWorkCond.Signal()
   246  	}()
   247  
   248  	if work.localRootID == ids.Empty {
   249  		// the keys in this range have not been downloaded, so get all key/values
   250  		m.getAndApplyRangeProof(ctx, work)
   251  	} else {
   252  		// the keys in this range have already been downloaded, but the root changed, so get all changes
   253  		m.getAndApplyChangeProof(ctx, work)
   254  	}
   255  }
   256  
// Fetch and apply the change proof given by [work].
// Assumes [m.workLock] is not held.
func (m *Manager) getAndApplyChangeProof(ctx context.Context, work *workItem) {
	targetRootID := m.getTargetRoot()

	if work.localRootID == targetRootID {
		// Start root is the same as the end root, so we're done.
		m.completeWorkItem(ctx, work, work.end, targetRootID, nil)
		return
	}

	if targetRootID == ids.Empty {
		// The trie is empty after this change.
		// Delete all the key-value pairs in the range.
		if err := m.config.DB.Clear(); err != nil {
			m.setError(err)
			return
		}
		// The whole DB was cleared, so this item now effectively covers the
		// range starting from the beginning of the key space.
		work.start = maybe.Nothing[[]byte]()
		m.completeWorkItem(ctx, work, maybe.Nothing[[]byte](), targetRootID, nil)
		return
	}

	// Ask a peer for the changes between our local root and the target root,
	// bounded to this item's key range and the default key/byte limits.
	changeOrRangeProof, err := m.config.Client.GetChangeProof(
		ctx,
		&pb.SyncGetChangeProofRequest{
			StartRootHash: work.localRootID[:],
			EndRootHash:   targetRootID[:],
			StartKey: &pb.MaybeBytes{
				Value:     work.start.Value(),
				IsNothing: work.start.IsNothing(),
			},
			EndKey: &pb.MaybeBytes{
				Value:     work.end.Value(),
				IsNothing: work.end.IsNothing(),
			},
			KeyLimit:   defaultRequestKeyLimit,
			BytesLimit: defaultRequestByteSizeLimit,
		},
		m.config.DB,
	)
	if err != nil {
		m.setError(err)
		return
	}

	select {
	case <-m.doneChan:
		// If we're closed, don't apply the proof.
		return
	default:
	}

	if changeOrRangeProof.ChangeProof != nil {
		// The server had sufficient history to respond with a change proof.
		changeProof := changeOrRangeProof.ChangeProof
		largestHandledKey := work.end
		// if the proof wasn't empty, apply changes to the sync DB
		if len(changeProof.KeyChanges) > 0 {
			if err := m.config.DB.CommitChangeProof(ctx, changeProof); err != nil {
				m.setError(err)
				return
			}
			// The last key change is taken as the largest key handled;
			// presumably KeyChanges is sorted by key — TODO confirm at the
			// proof type's definition.
			largestHandledKey = maybe.Some(changeProof.KeyChanges[len(changeProof.KeyChanges)-1].Key)
		}

		m.completeWorkItem(ctx, work, largestHandledKey, targetRootID, changeProof.EndProof)
		return
	}

	// The server responded with a range proof.
	rangeProof := changeOrRangeProof.RangeProof
	largestHandledKey := work.end
	if len(rangeProof.KeyValues) > 0 {
		// Add all the key-value pairs we got to the database.
		if err := m.config.DB.CommitRangeProof(ctx, work.start, work.end, rangeProof); err != nil {
			m.setError(err)
			return
		}
		largestHandledKey = maybe.Some(rangeProof.KeyValues[len(rangeProof.KeyValues)-1].Key)
	}

	m.completeWorkItem(ctx, work, largestHandledKey, targetRootID, rangeProof.EndProof)
}
   341  
// Fetch and apply the range proof given by [work].
// Assumes [m.workLock] is not held.
func (m *Manager) getAndApplyRangeProof(ctx context.Context, work *workItem) {
	targetRootID := m.getTargetRoot()

	if targetRootID == ids.Empty {
		// The target trie is empty; drop everything we have locally.
		if err := m.config.DB.Clear(); err != nil {
			m.setError(err)
			return
		}
		// The whole DB was cleared, so this item now effectively covers the
		// range starting from the beginning of the key space.
		work.start = maybe.Nothing[[]byte]()
		m.completeWorkItem(ctx, work, maybe.Nothing[[]byte](), targetRootID, nil)
		return
	}

	// Ask a peer for all key/values in this item's range at the target root.
	proof, err := m.config.Client.GetRangeProof(ctx,
		&pb.SyncGetRangeProofRequest{
			RootHash: targetRootID[:],
			StartKey: &pb.MaybeBytes{
				Value:     work.start.Value(),
				IsNothing: work.start.IsNothing(),
			},
			EndKey: &pb.MaybeBytes{
				Value:     work.end.Value(),
				IsNothing: work.end.IsNothing(),
			},
			KeyLimit:   defaultRequestKeyLimit,
			BytesLimit: defaultRequestByteSizeLimit,
		},
	)
	if err != nil {
		m.setError(err)
		return
	}

	select {
	case <-m.doneChan:
		// If we're closed, don't apply the proof.
		return
	default:
	}

	largestHandledKey := work.end

	// Replace all the key-value pairs in the DB from start to end with values from the response.
	if err := m.config.DB.CommitRangeProof(ctx, work.start, work.end, proof); err != nil {
		m.setError(err)
		return
	}

	if len(proof.KeyValues) > 0 {
		// The response may have been truncated by the key/byte limits;
		// record the largest key actually received.
		largestHandledKey = maybe.Some(proof.KeyValues[len(proof.KeyValues)-1].Key)
	}

	m.completeWorkItem(ctx, work, largestHandledKey, targetRootID, proof.EndProof)
}
   398  
   399  // findNextKey returns the start of the key range that should be fetched next
   400  // given that we just received a range/change proof that proved a range of
   401  // key-value pairs ending at [lastReceivedKey].
   402  //
   403  // [rangeEnd] is the end of the range that we want to fetch.
   404  //
   405  // Returns Nothing if there are no more keys to fetch in [lastReceivedKey, rangeEnd].
   406  //
   407  // [endProof] is the end proof of the last proof received.
   408  //
   409  // Invariant: [lastReceivedKey] < [rangeEnd].
   410  // If [rangeEnd] is Nothing it's considered > [lastReceivedKey].
   411  func (m *Manager) findNextKey(
   412  	ctx context.Context,
   413  	lastReceivedKey []byte,
   414  	rangeEnd maybe.Maybe[[]byte],
   415  	endProof []merkledb.ProofNode,
   416  ) (maybe.Maybe[[]byte], error) {
   417  	if len(endProof) == 0 {
   418  		// We try to find the next key to fetch by looking at the end proof.
   419  		// If the end proof is empty, we have no information to use.
   420  		// Start fetching from the next key after [lastReceivedKey].
   421  		nextKey := lastReceivedKey
   422  		nextKey = append(nextKey, 0)
   423  		return maybe.Some(nextKey), nil
   424  	}
   425  
   426  	// We want the first key larger than the [lastReceivedKey].
   427  	// This is done by taking two proofs for the same key
   428  	// (one that was just received as part of a proof, and one from the local db)
   429  	// and traversing them from the longest key to the shortest key.
   430  	// For each node in these proofs, compare if the children of that node exist
   431  	// or have the same ID in the other proof.
   432  	proofKeyPath := merkledb.ToKey(lastReceivedKey)
   433  
   434  	// If the received proof is an exclusion proof, the last node may be for a
   435  	// key that is after the [lastReceivedKey].
   436  	// If the last received node's key is after the [lastReceivedKey], it can
   437  	// be removed to obtain a valid proof for a prefix of the [lastReceivedKey].
   438  	if !proofKeyPath.HasPrefix(endProof[len(endProof)-1].Key) {
   439  		endProof = endProof[:len(endProof)-1]
   440  		// update the proofKeyPath to be for the prefix
   441  		proofKeyPath = endProof[len(endProof)-1].Key
   442  	}
   443  
   444  	// get a proof for the same key as the received proof from the local db
   445  	localProofOfKey, err := m.config.DB.GetProof(ctx, proofKeyPath.Bytes())
   446  	if err != nil {
   447  		return maybe.Nothing[[]byte](), err
   448  	}
   449  	localProofNodes := localProofOfKey.Path
   450  
   451  	// The local proof may also be an exclusion proof with an extra node.
   452  	// Remove this extra node if it exists to get a proof of the same key as the received proof
   453  	if !proofKeyPath.HasPrefix(localProofNodes[len(localProofNodes)-1].Key) {
   454  		localProofNodes = localProofNodes[:len(localProofNodes)-1]
   455  	}
   456  
   457  	nextKey := maybe.Nothing[[]byte]()
   458  
   459  	// Add sentinel node back into the localProofNodes, if it is missing.
   460  	// Required to ensure that a common node exists in both proofs
   461  	if len(localProofNodes) > 0 && localProofNodes[0].Key.Length() != 0 {
   462  		sentinel := merkledb.ProofNode{
   463  			Children: map[byte]ids.ID{
   464  				localProofNodes[0].Key.Token(0, m.tokenSize): ids.Empty,
   465  			},
   466  		}
   467  		localProofNodes = append([]merkledb.ProofNode{sentinel}, localProofNodes...)
   468  	}
   469  
   470  	// Add sentinel node back into the endProof, if it is missing.
   471  	// Required to ensure that a common node exists in both proofs
   472  	if len(endProof) > 0 && endProof[0].Key.Length() != 0 {
   473  		sentinel := merkledb.ProofNode{
   474  			Children: map[byte]ids.ID{
   475  				endProof[0].Key.Token(0, m.tokenSize): ids.Empty,
   476  			},
   477  		}
   478  		endProof = append([]merkledb.ProofNode{sentinel}, endProof...)
   479  	}
   480  
   481  	localProofNodeIndex := len(localProofNodes) - 1
   482  	receivedProofNodeIndex := len(endProof) - 1
   483  
   484  	// traverse the two proofs from the deepest nodes up to the sentinel node until a difference is found
   485  	for localProofNodeIndex >= 0 && receivedProofNodeIndex >= 0 && nextKey.IsNothing() {
   486  		localProofNode := localProofNodes[localProofNodeIndex]
   487  		receivedProofNode := endProof[receivedProofNodeIndex]
   488  
   489  		// [deepestNode] is the proof node with the longest key (deepest in the trie) in the
   490  		// two proofs that hasn't been handled yet.
   491  		// [deepestNodeFromOtherProof] is the proof node from the other proof with
   492  		// the same key/depth if it exists, nil otherwise.
   493  		var deepestNode, deepestNodeFromOtherProof *merkledb.ProofNode
   494  
   495  		// select the deepest proof node from the two proofs
   496  		switch {
   497  		case receivedProofNode.Key.Length() > localProofNode.Key.Length():
   498  			// there was a branch node in the received proof that isn't in the local proof
   499  			// see if the received proof node has children not present in the local proof
   500  			deepestNode = &receivedProofNode
   501  
   502  			// we have dealt with this received node, so move on to the next received node
   503  			receivedProofNodeIndex--
   504  
   505  		case localProofNode.Key.Length() > receivedProofNode.Key.Length():
   506  			// there was a branch node in the local proof that isn't in the received proof
   507  			// see if the local proof node has children not present in the received proof
   508  			deepestNode = &localProofNode
   509  
   510  			// we have dealt with this local node, so move on to the next local node
   511  			localProofNodeIndex--
   512  
   513  		default:
   514  			// the two nodes are at the same depth
   515  			// see if any of the children present in the local proof node are different
   516  			// from the children in the received proof node
   517  			deepestNode = &localProofNode
   518  			deepestNodeFromOtherProof = &receivedProofNode
   519  
   520  			// we have dealt with this local node and received node, so move on to the next nodes
   521  			localProofNodeIndex--
   522  			receivedProofNodeIndex--
   523  		}
   524  
   525  		// We only want to look at the children with keys greater than the proofKey.
   526  		// The proof key has the deepest node's key as a prefix,
   527  		// so only the next token of the proof key needs to be considered.
   528  
   529  		// If the deepest node has the same key as [proofKeyPath],
   530  		// then all of its children have keys greater than the proof key,
   531  		// so we can start at the 0 token.
   532  		startingChildToken := 0
   533  
   534  		// If the deepest node has a key shorter than the key being proven,
   535  		// we can look at the next token index of the proof key to determine which of that
   536  		// node's children have keys larger than [proofKeyPath].
   537  		// Any child with a token greater than the [proofKeyPath]'s token at that
   538  		// index will have a larger key.
   539  		if deepestNode.Key.Length() < proofKeyPath.Length() {
   540  			startingChildToken = int(proofKeyPath.Token(deepestNode.Key.Length(), m.tokenSize)) + 1
   541  		}
   542  
   543  		// determine if there are any differences in the children for the deepest unhandled node of the two proofs
   544  		if childIndex, hasDifference := findChildDifference(deepestNode, deepestNodeFromOtherProof, startingChildToken); hasDifference {
   545  			nextKey = maybe.Some(deepestNode.Key.Extend(merkledb.ToToken(childIndex, m.tokenSize)).Bytes())
   546  			break
   547  		}
   548  	}
   549  
   550  	// If the nextKey is before or equal to the [lastReceivedKey]
   551  	// then we couldn't find a better answer than the [lastReceivedKey].
   552  	// Set the nextKey to [lastReceivedKey] + 0, which is the first key in
   553  	// the open range (lastReceivedKey, rangeEnd).
   554  	if nextKey.HasValue() && bytes.Compare(nextKey.Value(), lastReceivedKey) <= 0 {
   555  		nextKeyVal := slices.Clone(lastReceivedKey)
   556  		nextKeyVal = append(nextKeyVal, 0)
   557  		nextKey = maybe.Some(nextKeyVal)
   558  	}
   559  
   560  	// If the [nextKey] is larger than the end of the range, return Nothing to signal that there is no next key in range
   561  	if rangeEnd.HasValue() && bytes.Compare(nextKey.Value(), rangeEnd.Value()) >= 0 {
   562  		return maybe.Nothing[[]byte](), nil
   563  	}
   564  
   565  	// the nextKey is within the open range (lastReceivedKey, rangeEnd), so return it
   566  	return nextKey, nil
   567  }
   568  
   569  func (m *Manager) Error() error {
   570  	m.errLock.Lock()
   571  	defer m.errLock.Unlock()
   572  
   573  	return m.fatalError
   574  }
   575  
   576  // Wait blocks until one of the following occurs:
   577  // - sync is complete.
   578  // - sync fatally errored.
   579  // - [ctx] is canceled.
   580  // If [ctx] is canceled, returns [ctx].Err().
   581  func (m *Manager) Wait(ctx context.Context) error {
   582  	select {
   583  	case <-m.doneChan:
   584  	case <-ctx.Done():
   585  		return ctx.Err()
   586  	}
   587  
   588  	// There was a fatal error.
   589  	if err := m.Error(); err != nil {
   590  		return err
   591  	}
   592  
   593  	root, err := m.config.DB.GetMerkleRoot(ctx)
   594  	if err != nil {
   595  		return err
   596  	}
   597  
   598  	if targetRootID := m.getTargetRoot(); targetRootID != root {
   599  		// This should never happen.
   600  		return fmt.Errorf("%w: expected %s, got %s", ErrFinishedWithUnexpectedRoot, targetRootID, root)
   601  	}
   602  
   603  	m.config.Log.Info("completed", zap.Stringer("root", root))
   604  	return nil
   605  }
   606  
// UpdateSyncTarget changes the root being synced to [syncTargetRoot].
// All already-processed ranges are requeued with high priority so they are
// re-verified against the new root.
// Returns ErrAlreadyClosed if the Manager is closed.
func (m *Manager) UpdateSyncTarget(syncTargetRoot ids.ID) error {
	// Lock ordering: [syncTargetLock] before [workLock], the same order used
	// by completeWorkItem.
	m.syncTargetLock.Lock()
	defer m.syncTargetLock.Unlock()

	m.workLock.Lock()
	defer m.workLock.Unlock()

	select {
	case <-m.doneChan:
		return ErrAlreadyClosed
	default:
	}

	if m.config.TargetRoot == syncTargetRoot {
		// the target hasn't changed, so there is nothing to do
		return nil
	}

	m.config.Log.Debug("updated sync target", zap.Stringer("target", syncTargetRoot))
	m.config.TargetRoot = syncTargetRoot

	// move all completed ranges into the work heap with high priority
	shouldSignal := m.processedWork.Len() > 0
	for m.processedWork.Len() > 0 {
		// Note that [m.processedWork].Close() hasn't
		// been called because we have [m.workLock]
		// and we checked that [m.closed] is false.
		currentItem := m.processedWork.GetWork()
		currentItem.priority = highPriority
		m.unprocessedWork.Insert(currentItem)
	}
	if shouldSignal {
		// Only signal once because we only have 1 goroutine
		// waiting on [m.unprocessedWorkCond].
		m.unprocessedWorkCond.Signal()
	}
	return nil
}
   645  
   646  func (m *Manager) getTargetRoot() ids.ID {
   647  	m.syncTargetLock.RLock()
   648  	defer m.syncTargetLock.RUnlock()
   649  
   650  	return m.config.TargetRoot
   651  }
   652  
   653  // Record that there was a fatal error and begin shutting down.
   654  func (m *Manager) setError(err error) {
   655  	m.errLock.Lock()
   656  	defer m.errLock.Unlock()
   657  
   658  	m.config.Log.Error("sync errored", zap.Error(err))
   659  	m.fatalError = err
   660  	// Call in goroutine because we might be holding [m.workLock]
   661  	// which [m.Close] will try to acquire.
   662  	go m.Close()
   663  }
   664  
// Mark that we've fetched all the key-value pairs in the range
// [workItem.start, largestHandledKey] for the trie with root [rootID].
//
// If [workItem.start] is Nothing, then we've fetched all the key-value
// pairs up to and including [largestHandledKey].
//
// If [largestHandledKey] is Nothing, then we've fetched all the key-value
// pairs at and after [workItem.start].
//
// [proofOfLargestKey] is the end proof for the range/change proof
// that gave us the range up to and including [largestHandledKey].
//
// Assumes [m.workLock] is not held.
func (m *Manager) completeWorkItem(ctx context.Context, work *workItem, largestHandledKey maybe.Maybe[[]byte], rootID ids.ID, proofOfLargestKey []merkledb.ProofNode) {
	if !maybe.Equal(largestHandledKey, work.end, bytes.Equal) {
		// The largest handled key isn't equal to the end of the work item.
		// Find the start of the next key range to fetch.
		// Note that [largestHandledKey] can't be Nothing.
		// Proof: Suppose it is. That means that we got a range/change proof that proved up to the
		// greatest key-value pair in the database. That means we requested a proof with no upper
		// bound. That is, [workItem.end] is Nothing. Since we're here, [bothNothing] is false,
		// which means [workItem.end] isn't Nothing. Contradiction.
		nextStartKey, err := m.findNextKey(ctx, largestHandledKey.Value(), work.end, proofOfLargestKey)
		if err != nil {
			m.setError(err)
			return
		}

		// nextStartKey being Nothing indicates that the entire range has been completed
		if nextStartKey.IsNothing() {
			largestHandledKey = work.end
		} else {
			// the full range wasn't completed, so enqueue a new work item for the range [nextStartKey, workItem.end]
			m.enqueueWork(newWorkItem(work.localRootID, nextStartKey, work.end, work.priority))
			largestHandledKey = nextStartKey
		}
	}

	// Process [work] while holding [syncTargetLock] to ensure that object
	// is added to the right queue, even if a target update is triggered
	m.syncTargetLock.RLock()
	defer m.syncTargetLock.RUnlock()

	// If the target root changed while we were working, the data we fetched
	// was verified against a stale root and must be re-fetched.
	stale := m.config.TargetRoot != rootID
	if stale {
		// the root has changed, so reinsert with high priority
		m.enqueueWork(newWorkItem(rootID, work.start, largestHandledKey, highPriority))
	} else {
		m.workLock.Lock()
		defer m.workLock.Unlock()

		// MergeInsert may coalesce this range with adjacent completed ranges.
		m.processedWork.MergeInsert(newWorkItem(rootID, work.start, largestHandledKey, work.priority))
	}

	// completed the range [work.start, lastKey], log and record in the completed work heap
	m.config.Log.Debug("completed range",
		zap.Stringer("start", work.start),
		zap.Stringer("end", largestHandledKey),
		zap.Stringer("rootID", rootID),
		zap.Bool("stale", stale),
	)
}
   727  
   728  // Queue the given key range to be fetched and applied.
   729  // If there are sufficiently few unprocessed/processing work items,
   730  // splits the range into two items and queues them both.
   731  // Assumes [m.workLock] is not held.
   732  func (m *Manager) enqueueWork(work *workItem) {
   733  	m.workLock.Lock()
   734  	defer func() {
   735  		m.workLock.Unlock()
   736  		m.unprocessedWorkCond.Signal()
   737  	}()
   738  
   739  	if m.processingWorkItems+m.unprocessedWork.Len() > 2*m.config.SimultaneousWorkLimit {
   740  		// There are too many work items already, don't split the range
   741  		m.unprocessedWork.Insert(work)
   742  		return
   743  	}
   744  
   745  	// Split the remaining range into to 2.
   746  	// Find the middle point.
   747  	mid := midPoint(work.start, work.end)
   748  
   749  	if maybe.Equal(work.start, mid, bytes.Equal) || maybe.Equal(mid, work.end, bytes.Equal) {
   750  		// The range is too small to split.
   751  		// If we didn't have this check we would add work items
   752  		// [start, start] and [start, end]. Since start <= end, this would
   753  		// violate the invariant of [m.unprocessedWork] and [m.processedWork]
   754  		// that there are no overlapping ranges.
   755  		m.unprocessedWork.Insert(work)
   756  		return
   757  	}
   758  
   759  	// first item gets higher priority than the second to encourage finished ranges to grow
   760  	// rather than start a new range that is not contiguous with existing completed ranges
   761  	first := newWorkItem(work.localRootID, work.start, mid, medPriority)
   762  	second := newWorkItem(work.localRootID, mid, work.end, lowPriority)
   763  
   764  	m.unprocessedWork.Insert(first)
   765  	m.unprocessedWork.Insert(second)
   766  }
   767  
// find the midpoint between two keys
// start is expected to be less than end
// Nothing/nil [start] is treated as all 0's
// Nothing/nil [end] is treated as all 255's
func midPoint(startMaybe, endMaybe maybe.Maybe[[]byte]) maybe.Maybe[[]byte] {
	start := startMaybe.Value()
	end := endMaybe.Value()
	// Work over the longer of the two keys; bytes past a key's end are
	// treated as 0 for [start] and, when [end] is Nothing, as 255.
	length := len(start)
	if len(end) > length {
		length = len(end)
	}

	if length == 0 {
		if endMaybe.IsNothing() {
			// Range is (empty key, unbounded): midpoint of all-0s and
			// all-255s is a single byte 127.
			return maybe.Some([]byte{127})
		} else if len(end) == 0 {
			// Both bounds are the empty key; there is no key between them.
			return maybe.Nothing[[]byte]()
		}
	}

	// This check deals with cases where the end has a 255(or is nothing which is treated as all 255s) and the start key ends 255.
	// For example, midPoint([255], nothing) should be [255, 127], not [255].
	// The result needs the extra byte added on to the end to deal with the fact that the naive midpoint between 255 and 255 would be 255
	if (len(start) > 0 && start[len(start)-1] == 255) && (len(end) == 0 || end[len(end)-1] == 255) {
		length++
	}

	// Compute (start + end) / 2 byte-by-byte, big-endian, carrying the
	// fractional remainder of each division into the next (less
	// significant) byte via [leftover].
	leftover := 0
	midpoint := make([]byte, length+1)
	for i := 0; i < length; i++ {
		startVal := 0
		if i < len(start) {
			startVal = int(start[i])
		}

		endVal := 0
		if endMaybe.IsNothing() {
			endVal = 255
		}
		if i < len(end) {
			endVal = int(end[i])
		}

		total := startVal + endVal + leftover
		leftover = 0
		// if total is odd, when we divide, we will lose the .5,
		// record that in the leftover for the next digits
		if total%2 == 1 {
			leftover = 256
		}

		// find the midpoint between the start and the end
		total /= 2

		// larger than byte can hold, so carry over to previous byte
		if total >= 256 {
			total -= 256
			index := i - 1
			// Propagate the carry leftward past any 255 bytes.
			for index > 0 && midpoint[index] == 255 {
				midpoint[index] = 0
				index--
			}
			midpoint[index]++
		}
		midpoint[i] = byte(total)
	}
	if leftover > 0 {
		// A trailing .5 remains; append 127 so the result stays strictly
		// between start and end.
		midpoint[length] = 127
	} else {
		// No remainder; drop the scratch byte reserved for it.
		midpoint = midpoint[0:length]
	}
	return maybe.Some(midpoint)
}
   841  
   842  // findChildDifference returns the first child index that is different between node 1 and node 2 if one exists and
   843  // a bool indicating if any difference was found
   844  func findChildDifference(node1, node2 *merkledb.ProofNode, startIndex int) (byte, bool) {
   845  	// Children indices >= [startIndex] present in at least one of the nodes.
   846  	childIndices := set.Set[byte]{}
   847  	for _, node := range []*merkledb.ProofNode{node1, node2} {
   848  		if node == nil {
   849  			continue
   850  		}
   851  		for key := range node.Children {
   852  			if int(key) >= startIndex {
   853  				childIndices.Add(key)
   854  			}
   855  		}
   856  	}
   857  
   858  	sortedChildIndices := maps.Keys(childIndices)
   859  	slices.Sort(sortedChildIndices)
   860  	var (
   861  		child1, child2 ids.ID
   862  		ok1, ok2       bool
   863  	)
   864  	for _, childIndex := range sortedChildIndices {
   865  		if node1 != nil {
   866  			child1, ok1 = node1.Children[childIndex]
   867  		}
   868  		if node2 != nil {
   869  			child2, ok2 = node2.Children[childIndex]
   870  		}
   871  		// if one node has a child and the other doesn't or the children ids don't match,
   872  		// return the current child index as the first difference
   873  		if (ok1 || ok2) && child1 != child2 {
   874  			return childIndex, true
   875  		}
   876  	}
   877  	// there were no differences found
   878  	return 0, false
   879  }