github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/epochs/qc_voter.go (about)

     1  package epochs
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/onflow/flow-go/module/retrymiddleware"
    11  	"github.com/onflow/flow-go/network"
    12  
    13  	"github.com/sethvargo/go-retry"
    14  
    15  	"github.com/rs/zerolog"
    16  
    17  	"github.com/onflow/flow-go/consensus/hotstuff"
    18  	hotmodel "github.com/onflow/flow-go/consensus/hotstuff/model"
    19  	"github.com/onflow/flow-go/model/flow"
    20  	"github.com/onflow/flow-go/module"
    21  	clusterstate "github.com/onflow/flow-go/state/cluster"
    22  	"github.com/onflow/flow-go/state/protocol"
    23  )
    24  
    25  const (
    26  	// retryDuration is the initial duration to wait between retries for all retryable
    27  	// requests - increases exponentially for subsequent retries
    28  	retryDuration = time.Second
    29  
    30  	// update qc contract client after 2 consecutive failures
    31  	retryMaxConsecutiveFailures = 2
    32  
    33  	// retryDurationMax is the maximum duration to wait between two consecutive requests
    34  	retryDurationMax = 10 * time.Minute
    35  
    36  	// retryJitterPercent is the percentage jitter to introduce to each retry interval
    37  	retryJitterPercent = 25 // 25%
    38  )
    39  
    40  // RootQCVoter is responsible for generating and submitting votes for the
    41  // root quorum certificate of the upcoming epoch for this node's cluster.
    42  type RootQCVoter struct {
    43  	log                       zerolog.Logger
    44  	me                        module.Local
    45  	signer                    hotstuff.Signer
    46  	state                     protocol.State
    47  	qcContractClients         []module.QCContractClient // priority ordered array of client to the QC aggregator smart contract
    48  	lastSuccessfulClientIndex int                       // index of the contract client that was last successful during retries
    49  	wait                      time.Duration             // how long to sleep in between vote attempts
    50  	mu                        sync.Mutex
    51  }
    52  
    53  // NewRootQCVoter returns a new root QC voter, configured for a particular epoch.
    54  func NewRootQCVoter(
    55  	log zerolog.Logger,
    56  	me module.Local,
    57  	signer hotstuff.Signer,
    58  	state protocol.State,
    59  	contractClients []module.QCContractClient,
    60  ) *RootQCVoter {
    61  
    62  	voter := &RootQCVoter{
    63  		log:               log.With().Str("module", "root_qc_voter").Logger(),
    64  		me:                me,
    65  		signer:            signer,
    66  		state:             state,
    67  		qcContractClients: contractClients,
    68  		wait:              time.Second * 10,
    69  		mu:                sync.Mutex{},
    70  	}
    71  	return voter
    72  }
    73  
    74  // Vote handles the full procedure of generating a vote, submitting it to the
    75  // epoch smart contract, and verifying submission.
    76  // It is safe to run multiple times within a single setup phase.
    77  //
    78  // Error returns:
    79  //   - ErrWontVote if we fail to vote for a benign reason
    80  //   - generic error in case of critical unexpected failure
    81  func (voter *RootQCVoter) Vote(ctx context.Context, epoch protocol.Epoch) error {
    82  
    83  	counter, err := epoch.Counter()
    84  	if err != nil {
    85  		return fmt.Errorf("could not get epoch counter: %w", err)
    86  	}
    87  	clusters, err := epoch.Clustering()
    88  	if err != nil {
    89  		return fmt.Errorf("could not get clustering: %w", err)
    90  	}
    91  	cluster, clusterIndex, ok := clusters.ByNodeID(voter.me.NodeID())
    92  	if !ok {
    93  		return NewClusterQCNoVoteErrorf("could not find self in clustering")
    94  	}
    95  
    96  	log := voter.log.With().
    97  		Uint64("epoch", counter).
    98  		Uint("cluster_index", clusterIndex).
    99  		Logger()
   100  
   101  	log.Info().Msg("preparing to generate vote for cluster root qc")
   102  
   103  	// create the canonical root block for our cluster
   104  	root := clusterstate.CanonicalRootBlock(counter, cluster)
   105  	// create a signable hotstuff model
   106  	signable := hotmodel.GenesisBlockFromFlow(root.Header)
   107  
   108  	vote, err := voter.signer.CreateVote(signable)
   109  	if err != nil {
   110  		return fmt.Errorf("could not create vote for cluster root qc: %w", err)
   111  	}
   112  
   113  	// this backoff configuration will never terminate on its own, but the
   114  	// request logic will exit when we exit the EpochSetup phase
   115  	backoff := retry.NewExponential(retryDuration)
   116  	backoff = retry.WithCappedDuration(retryDurationMax, backoff)
   117  	backoff = retry.WithJitterPercent(retryJitterPercent, backoff)
   118  
   119  	clientIndex, qcContractClient := voter.getInitialContractClient()
   120  	onMaxConsecutiveRetries := func(totalAttempts int) {
   121  		clientIndex, qcContractClient = voter.updateContractClient(clientIndex)
   122  		log.Warn().Msgf("retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
   123  	}
   124  	backoff = retrymiddleware.AfterConsecutiveFailures(retryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)
   125  
   126  	err = retry.Do(ctx, backoff, func(ctx context.Context) error {
   127  		// check that we're still in the setup phase, if we're not we can't
   128  		// submit a vote anyway and must exit this process
   129  		phase, err := voter.state.Final().Phase()
   130  		if err != nil {
   131  			return fmt.Errorf("unexpected error - unable to get current epoch phase: %w", err)
   132  		} else if phase != flow.EpochPhaseSetup {
   133  			return NewClusterQCNoVoteErrorf("could not submit vote because we we are not in EpochSetup phase (in %s phase instead)", phase)
   134  		}
   135  
   136  		// check whether we've already voted, if we have we can exit early
   137  		voted, err := qcContractClient.Voted(ctx)
   138  		if err != nil {
   139  			if network.IsTransientError(err) {
   140  				log.Warn().Err(err).Msg("unable to check vote status, retrying...")
   141  				return retry.RetryableError(err)
   142  			}
   143  			return fmt.Errorf("unexpected error in Voted script execution: %w", err)
   144  		} else if voted {
   145  			log.Info().Msg("already voted - exiting QC vote process...")
   146  			// update our last successful client index for future calls
   147  			voter.updateLastSuccessfulClient(clientIndex)
   148  			return nil
   149  		}
   150  
   151  		// submit the vote - this call will block until the transaction has
   152  		// either succeeded or we are able to retry
   153  		log.Info().Msg("submitting vote...")
   154  		err = qcContractClient.SubmitVote(ctx, vote)
   155  		if err != nil {
   156  			if network.IsTransientError(err) || errors.Is(err, errTransactionExpired) {
   157  				log.Warn().Err(err).Msg("could not submit vote due to transient failure - retrying...")
   158  				return retry.RetryableError(err)
   159  			} else if errors.Is(err, errTransactionReverted) {
   160  				// this error case could be benign or not - if we observe it, we should investigate further
   161  				log.Err(err).Msg("vote submission failed due to execution failure - caution: this could be either a benign error (eg. 'already voted') or a critical bug - retrying")
   162  				return retry.RetryableError(err)
   163  			} else {
   164  				return fmt.Errorf("unexpected error submitting vote: %w", err)
   165  			}
   166  		}
   167  
   168  		log.Info().Msg("successfully submitted vote - exiting QC vote process...")
   169  
   170  		// update our last successful client index for future calls
   171  		voter.updateLastSuccessfulClient(clientIndex)
   172  		return nil
   173  	})
   174  	if network.IsTransientError(err) || errors.Is(err, errTransactionReverted) || errors.Is(err, errTransactionReverted) {
   175  		return NewClusterQCNoVoteErrorf("exceeded retry limit without successfully submitting our vote: %w", err)
   176  	}
   177  	return err
   178  }
   179  
   180  // updateContractClient will return the last successful client index by default for all initial operations or else
   181  // it will return the appropriate client index with respect to last successful and number of client.
   182  func (voter *RootQCVoter) updateContractClient(clientIndex int) (int, module.QCContractClient) {
   183  	voter.mu.Lock()
   184  	defer voter.mu.Unlock()
   185  	if clientIndex == voter.lastSuccessfulClientIndex {
   186  		if clientIndex == len(voter.qcContractClients)-1 {
   187  			clientIndex = 0
   188  		} else {
   189  			clientIndex++
   190  		}
   191  	} else {
   192  		clientIndex = voter.lastSuccessfulClientIndex
   193  	}
   194  
   195  	return clientIndex, voter.qcContractClients[clientIndex]
   196  }
   197  
   198  // getInitialContractClient will return the last successful contract client or the initial
   199  func (voter *RootQCVoter) getInitialContractClient() (int, module.QCContractClient) {
   200  	voter.mu.Lock()
   201  	defer voter.mu.Unlock()
   202  	return voter.lastSuccessfulClientIndex, voter.qcContractClients[voter.lastSuccessfulClientIndex]
   203  }
   204  
   205  // updateLastSuccessfulClient set lastSuccessfulClientIndex in concurrency safe way
   206  func (voter *RootQCVoter) updateLastSuccessfulClient(clientIndex int) {
   207  	voter.mu.Lock()
   208  	defer voter.mu.Unlock()
   209  
   210  	voter.lastSuccessfulClientIndex = clientIndex
   211  }