// Source: github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/epochs/qc_voter.go

package epochs

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/onflow/flow-go/module/retrymiddleware"
	"github.com/onflow/flow-go/network"

	"github.com/sethvargo/go-retry"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/consensus/hotstuff"
	hotmodel "github.com/onflow/flow-go/consensus/hotstuff/model"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
	clusterstate "github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/state/protocol"
)

const (
	// retryDuration is the initial duration to wait between retries for all retryable
	// requests - increases exponentially for subsequent retries. It seeds the
	// exponential backoff built in Vote.
	retryDuration = time.Second

	// retryMaxConsecutiveFailures is the number of consecutive failures after which
	// Vote switches to a different QC contract client (currently 2).
	retryMaxConsecutiveFailures = 2

	// retryDurationMax is the maximum duration to wait between two consecutive requests;
	// it caps the exponential backoff.
	retryDurationMax = 10 * time.Minute

	// retryJitterPercent is the percentage jitter to introduce to each retry interval
	retryJitterPercent = 25 // 25%
)

// RootQCVoter is responsible for generating and submitting votes for the
// root quorum certificate of the upcoming epoch for this node's cluster.
//
// A RootQCVoter contains a sync.Mutex and must therefore be used through a
// pointer (as returned by NewRootQCVoter) and never copied.
type RootQCVoter struct {
	log                       zerolog.Logger            // logger tagged with module=root_qc_voter by the constructor
	me                        module.Local              // provides this node's ID, used to locate our cluster
	signer                    hotstuff.Signer           // creates the vote for the cluster root block
	state                     protocol.State            // queried for the current epoch phase before each attempt
	qcContractClients         []module.QCContractClient // priority ordered array of client to the QC aggregator smart contract
	lastSuccessfulClientIndex int                       // index of the contract client that was last successful during retries; guarded by mu
	wait                      time.Duration             // how long to sleep in between vote attempts. NOTE(review): set by the constructor but not read in this file's visible code - confirm it is still needed
	mu                        sync.Mutex                // guards lastSuccessfulClientIndex
}

// NewRootQCVoter returns a new root QC voter that submits votes through the
// given priority-ordered list of QC contract clients.
54 func NewRootQCVoter( 55 log zerolog.Logger, 56 me module.Local, 57 signer hotstuff.Signer, 58 state protocol.State, 59 contractClients []module.QCContractClient, 60 ) *RootQCVoter { 61 62 voter := &RootQCVoter{ 63 log: log.With().Str("module", "root_qc_voter").Logger(), 64 me: me, 65 signer: signer, 66 state: state, 67 qcContractClients: contractClients, 68 wait: time.Second * 10, 69 mu: sync.Mutex{}, 70 } 71 return voter 72 } 73 74 // Vote handles the full procedure of generating a vote, submitting it to the 75 // epoch smart contract, and verifying submission. 76 // It is safe to run multiple times within a single setup phase. 77 // 78 // Error returns: 79 // - ErrWontVote if we fail to vote for a benign reason 80 // - generic error in case of critical unexpected failure 81 func (voter *RootQCVoter) Vote(ctx context.Context, epoch protocol.Epoch) error { 82 83 counter, err := epoch.Counter() 84 if err != nil { 85 return fmt.Errorf("could not get epoch counter: %w", err) 86 } 87 clusters, err := epoch.Clustering() 88 if err != nil { 89 return fmt.Errorf("could not get clustering: %w", err) 90 } 91 cluster, clusterIndex, ok := clusters.ByNodeID(voter.me.NodeID()) 92 if !ok { 93 return NewClusterQCNoVoteErrorf("could not find self in clustering") 94 } 95 96 log := voter.log.With(). 97 Uint64("epoch", counter). 98 Uint("cluster_index", clusterIndex). 
99 Logger() 100 101 log.Info().Msg("preparing to generate vote for cluster root qc") 102 103 // create the canonical root block for our cluster 104 root := clusterstate.CanonicalRootBlock(counter, cluster) 105 // create a signable hotstuff model 106 signable := hotmodel.GenesisBlockFromFlow(root.Header) 107 108 vote, err := voter.signer.CreateVote(signable) 109 if err != nil { 110 return fmt.Errorf("could not create vote for cluster root qc: %w", err) 111 } 112 113 // this backoff configuration will never terminate on its own, but the 114 // request logic will exit when we exit the EpochSetup phase 115 backoff := retry.NewExponential(retryDuration) 116 backoff = retry.WithCappedDuration(retryDurationMax, backoff) 117 backoff = retry.WithJitterPercent(retryJitterPercent, backoff) 118 119 clientIndex, qcContractClient := voter.getInitialContractClient() 120 onMaxConsecutiveRetries := func(totalAttempts int) { 121 clientIndex, qcContractClient = voter.updateContractClient(clientIndex) 122 log.Warn().Msgf("retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex) 123 } 124 backoff = retrymiddleware.AfterConsecutiveFailures(retryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries) 125 126 err = retry.Do(ctx, backoff, func(ctx context.Context) error { 127 // check that we're still in the setup phase, if we're not we can't 128 // submit a vote anyway and must exit this process 129 phase, err := voter.state.Final().Phase() 130 if err != nil { 131 return fmt.Errorf("unexpected error - unable to get current epoch phase: %w", err) 132 } else if phase != flow.EpochPhaseSetup { 133 return NewClusterQCNoVoteErrorf("could not submit vote because we we are not in EpochSetup phase (in %s phase instead)", phase) 134 } 135 136 // check whether we've already voted, if we have we can exit early 137 voted, err := qcContractClient.Voted(ctx) 138 if err != nil { 139 if network.IsTransientError(err) { 140 log.Warn().Err(err).Msg("unable to check 
vote status, retrying...") 141 return retry.RetryableError(err) 142 } 143 return fmt.Errorf("unexpected error in Voted script execution: %w", err) 144 } else if voted { 145 log.Info().Msg("already voted - exiting QC vote process...") 146 // update our last successful client index for future calls 147 voter.updateLastSuccessfulClient(clientIndex) 148 return nil 149 } 150 151 // submit the vote - this call will block until the transaction has 152 // either succeeded or we are able to retry 153 log.Info().Msg("submitting vote...") 154 err = qcContractClient.SubmitVote(ctx, vote) 155 if err != nil { 156 if network.IsTransientError(err) || errors.Is(err, errTransactionExpired) { 157 log.Warn().Err(err).Msg("could not submit vote due to transient failure - retrying...") 158 return retry.RetryableError(err) 159 } else if errors.Is(err, errTransactionReverted) { 160 // this error case could be benign or not - if we observe it, we should investigate further 161 log.Err(err).Msg("vote submission failed due to execution failure - caution: this could be either a benign error (eg. 'already voted') or a critical bug - retrying") 162 return retry.RetryableError(err) 163 } else { 164 return fmt.Errorf("unexpected error submitting vote: %w", err) 165 } 166 } 167 168 log.Info().Msg("successfully submitted vote - exiting QC vote process...") 169 170 // update our last successful client index for future calls 171 voter.updateLastSuccessfulClient(clientIndex) 172 return nil 173 }) 174 if network.IsTransientError(err) || errors.Is(err, errTransactionReverted) || errors.Is(err, errTransactionReverted) { 175 return NewClusterQCNoVoteErrorf("exceeded retry limit without successfully submitting our vote: %w", err) 176 } 177 return err 178 } 179 180 // updateContractClient will return the last successful client index by default for all initial operations or else 181 // it will return the appropriate client index with respect to last successful and number of client. 
182 func (voter *RootQCVoter) updateContractClient(clientIndex int) (int, module.QCContractClient) { 183 voter.mu.Lock() 184 defer voter.mu.Unlock() 185 if clientIndex == voter.lastSuccessfulClientIndex { 186 if clientIndex == len(voter.qcContractClients)-1 { 187 clientIndex = 0 188 } else { 189 clientIndex++ 190 } 191 } else { 192 clientIndex = voter.lastSuccessfulClientIndex 193 } 194 195 return clientIndex, voter.qcContractClients[clientIndex] 196 } 197 198 // getInitialContractClient will return the last successful contract client or the initial 199 func (voter *RootQCVoter) getInitialContractClient() (int, module.QCContractClient) { 200 voter.mu.Lock() 201 defer voter.mu.Unlock() 202 return voter.lastSuccessfulClientIndex, voter.qcContractClients[voter.lastSuccessfulClientIndex] 203 } 204 205 // updateLastSuccessfulClient set lastSuccessfulClientIndex in concurrency safe way 206 func (voter *RootQCVoter) updateLastSuccessfulClient(clientIndex int) { 207 voter.mu.Lock() 208 defer voter.mu.Unlock() 209 210 voter.lastSuccessfulClientIndex = clientIndex 211 }