github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/safetyrules/safety_rules.go (about)

     1  package safetyrules
     2  
     3  import (
     4  	"fmt"
     5  
     6  	"github.com/onflow/flow-go/consensus/hotstuff"
     7  	"github.com/onflow/flow-go/consensus/hotstuff/model"
     8  	"github.com/onflow/flow-go/model/flow"
     9  )
    10  
// SafetyRules is a dedicated module that enforces consensus safety. This component has the sole authority to generate
// votes and timeouts. It follows voting and timeout rules for creating votes and timeouts respectively.
// Callers can be sure that a created vote or timeout doesn't break safety and can be used in the consensus process.
// SafetyRules relies on hotstuff.Persister to store the latest state of hotstuff.SafetyData.
//
// The voting rules implemented by SafetyRules are:
//  1. Replicas vote strictly in increasing rounds
//  2. Each block has to include a TC or a QC from the previous round.
//     a. [Happy path] If the previous round resulted in a QC then new QC should extend it.
//     b. [Recovery path] If the previous round did *not* result in a QC, the leader of the
//     subsequent round *must* include a valid TC for the previous round in its block.
//
// NOT safe for concurrent use.
type SafetyRules struct {
	signer     hotstuff.Signer           // creates the votes (CreateVote) and timeout objects (CreateTimeout)
	persist    hotstuff.Persister        // loads (GetSafetyData) and stores (PutSafetyData) the safety data
	committee  hotstuff.DynamicCommittee // only produce votes when we are valid committee members
	safetyData *hotstuff.SafetyData      // in-memory safety data; read from `persist` in New and handed to `persist` after each update
}
    30  
// Compile-time check that *SafetyRules can be used as a hotstuff.SafetyRules.
var _ hotstuff.SafetyRules = (*SafetyRules)(nil)
    32  
    33  // New creates a new SafetyRules instance
    34  func New(
    35  	signer hotstuff.Signer,
    36  	persist hotstuff.Persister,
    37  	committee hotstuff.DynamicCommittee,
    38  ) (*SafetyRules, error) {
    39  	// get the last stored safety data
    40  	safetyData, err := persist.GetSafetyData()
    41  	if err != nil {
    42  		return nil, fmt.Errorf("could not recover safety data: %w", err)
    43  	}
    44  
    45  	return &SafetyRules{
    46  		signer:     signer,
    47  		persist:    persist,
    48  		committee:  committee,
    49  		safetyData: safetyData,
    50  	}, nil
    51  }
    52  
    53  // ProduceVote will make a decision on whether it will vote for the given proposal, the returned
    54  // error indicates whether to vote or not.
    55  // To ensure that only safe proposals are being voted on, we check that the proposer is a valid committee member and that the
    56  // proposal complies with voting rules.
    57  // We expect that only well-formed proposals with valid signatures are submitted for voting.
    58  // The curView is taken as input to ensure SafetyRules will only vote for proposals at current view and prevent double voting.
    59  // Returns:
    60  //   - (vote, nil): On the _first_ block for the current view that is safe to vote for.
    61  //     Subsequently, voter does _not_ vote for any other block with the same (or lower) view.
    62  //   - (nil, model.NoVoteError): If the voter decides that it does not want to vote for the given block.
    63  //     This is a sentinel error and _expected_ during normal operation.
    64  //
    65  // All other errors are unexpected and potential symptoms of uncovered edge cases or corrupted internal state (fatal).
    66  func (r *SafetyRules) ProduceVote(proposal *model.Proposal, curView uint64) (*model.Vote, error) {
    67  	block := proposal.Block
    68  	// sanity checks:
    69  	if curView != block.View {
    70  		return nil, fmt.Errorf("expecting block for current view %d, but block's view is %d", curView, block.View)
    71  	}
    72  
    73  	err := r.IsSafeToVote(proposal)
    74  	if err != nil {
    75  		return nil, fmt.Errorf("not safe to vote for proposal %x: %w", proposal.Block.BlockID, err)
    76  	}
    77  
    78  	// we expect that only valid proposals are submitted for voting
    79  	// we need to make sure that proposer is not ejected to decide to vote or not
    80  	_, err = r.committee.IdentityByBlock(block.BlockID, block.ProposerID)
    81  	if model.IsInvalidSignerError(err) {
    82  		// the proposer must be ejected since the proposal has already been validated,
    83  		// which ensures that the proposer was a valid committee member at the start of the epoch
    84  		return nil, model.NewNoVoteErrorf("proposer ejected: %w", err)
    85  	}
    86  	if err != nil {
    87  		return nil, fmt.Errorf("internal error retrieving Identity of proposer %x at block %x: %w", block.ProposerID, block.BlockID, err)
    88  	}
    89  
    90  	// Do not produce a vote for blocks where we are not a valid committee member.
    91  	// HotStuff will ask for a vote for the first block of the next epoch, even if we
    92  	// have zero weight in the next epoch. Such vote can't be used to produce valid QCs.
    93  	_, err = r.committee.IdentityByBlock(block.BlockID, r.committee.Self())
    94  	if model.IsInvalidSignerError(err) {
    95  		return nil, model.NewNoVoteErrorf("I am not authorized to vote for block %x: %w", block.BlockID, err)
    96  	}
    97  	if err != nil {
    98  		return nil, fmt.Errorf("could not get self identity: %w", err)
    99  	}
   100  
   101  	vote, err := r.signer.CreateVote(block)
   102  	if err != nil {
   103  		return nil, fmt.Errorf("could not vote for block: %w", err)
   104  	}
   105  
   106  	// vote for the current view has been produced, update safetyData
   107  	r.safetyData.HighestAcknowledgedView = curView
   108  	if r.safetyData.LockedOneChainView < block.QC.View {
   109  		r.safetyData.LockedOneChainView = block.QC.View
   110  	}
   111  
   112  	err = r.persist.PutSafetyData(r.safetyData)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("could not persist safety data: %w", err)
   115  	}
   116  
   117  	return vote, nil
   118  }
   119  
   120  // ProduceTimeout takes current view, highest locally known QC and TC (optional, must be nil if and
   121  // only if QC is for previous view) and decides whether to produce timeout for current view.
   122  // Returns:
   123  //   - (timeout, nil): It is safe to timeout for current view using newestQC and lastViewTC.
   124  //   - (nil, model.NoTimeoutError): If replica is not part of the authorized consensus committee (anymore) and
   125  //     therefore is not authorized to produce a valid timeout object. This sentinel error is _expected_ during
   126  //     normal operation, e.g. during the grace-period after Epoch switchover or after the replica self-ejected.
   127  //
   128  // All other errors are unexpected and potential symptoms of uncovered edge cases or corrupted internal state (fatal).
   129  func (r *SafetyRules) ProduceTimeout(curView uint64, newestQC *flow.QuorumCertificate, lastViewTC *flow.TimeoutCertificate) (*model.TimeoutObject, error) {
   130  	lastTimeout := r.safetyData.LastTimeout
   131  	if lastTimeout != nil && lastTimeout.View == curView {
   132  		// model.TimeoutObject are conceptually immutable, hence we create a shallow copy here, which allows us to increment TimeoutTick
   133  		updatedTimeout := *lastTimeout
   134  		updatedTimeout.TimeoutTick += 1
   135  
   136  		// persist updated TimeoutObject in `safetyData` and return it
   137  		r.safetyData.LastTimeout = &updatedTimeout
   138  		err := r.persist.PutSafetyData(r.safetyData)
   139  		if err != nil {
   140  			return nil, fmt.Errorf("could not persist safety data: %w", err)
   141  		}
   142  		return r.safetyData.LastTimeout, nil
   143  	}
   144  
   145  	err := r.IsSafeToTimeout(curView, newestQC, lastViewTC)
   146  	if err != nil {
   147  		return nil, fmt.Errorf("local, trusted inputs failed safety rules: %w", err)
   148  	}
   149  
   150  	// Do not produce a timeout for view where we are not a valid committee member.
   151  	_, err = r.committee.IdentityByEpoch(curView, r.committee.Self())
   152  	if err != nil {
   153  		if model.IsInvalidSignerError(err) {
   154  			return nil, model.NewNoTimeoutErrorf("I am not authorized to timeout for view %d: %w", curView, err)
   155  		}
   156  		return nil, fmt.Errorf("could not get self identity: %w", err)
   157  	}
   158  
   159  	timeout, err := r.signer.CreateTimeout(curView, newestQC, lastViewTC)
   160  	if err != nil {
   161  		return nil, fmt.Errorf("could not create timeout at view %d: %w", curView, err)
   162  	}
   163  
   164  	r.safetyData.HighestAcknowledgedView = curView
   165  	r.safetyData.LastTimeout = timeout
   166  
   167  	err = r.persist.PutSafetyData(r.safetyData)
   168  	if err != nil {
   169  		return nil, fmt.Errorf("could not persist safety data: %w", err)
   170  	}
   171  
   172  	return timeout, nil
   173  }
   174  
   175  // IsSafeToVote checks if this proposal is valid in terms of voting rules, if voting for this proposal won't break safety rules.
   176  // Expected errors during normal operations:
   177  //   - NoVoteError if replica already acted during this view (either voted or generated timeout)
   178  func (r *SafetyRules) IsSafeToVote(proposal *model.Proposal) error {
   179  	blockView := proposal.Block.View
   180  
   181  	err := r.validateEvidenceForEnteringView(blockView, proposal.Block.QC, proposal.LastViewTC)
   182  	if err != nil {
   183  		// As we are expecting the blocks to be pre-validated, any failure here is a symptom of an internal bug.
   184  		return fmt.Errorf("proposal failed consensus validity check")
   185  	}
   186  
   187  	// This check satisfies voting rule 1
   188  	// 1. Replicas vote strictly in increasing rounds,
   189  	// block's view must be greater than the view that we have voted for
   190  	acView := r.safetyData.HighestAcknowledgedView
   191  	if blockView == acView {
   192  		return model.NewNoVoteErrorf("already voted or generated timeout in view %d", blockView)
   193  	}
   194  	if blockView < acView {
   195  		return fmt.Errorf("already acted during view %d but got proposal for lower view %d", acView, blockView)
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  // IsSafeToTimeout checks if it's safe to timeout with proposed data, i.e. timing out won't break safety.
   202  // newestQC is the valid QC with the greatest view that we have observed.
   203  // lastViewTC is the TC for the previous view (might be nil).
   204  //
   205  // When generating a timeout, the inputs are provided by node-internal components. Failure to comply with
   206  // the protocol is a symptom of an internal bug. We don't expect any errors during normal operations.
   207  func (r *SafetyRules) IsSafeToTimeout(curView uint64, newestQC *flow.QuorumCertificate, lastViewTC *flow.TimeoutCertificate) error {
   208  	err := r.validateEvidenceForEnteringView(curView, newestQC, lastViewTC)
   209  	if err != nil {
   210  		return fmt.Errorf("not safe to timeout: %w", err)
   211  	}
   212  
   213  	if newestQC.View < r.safetyData.LockedOneChainView {
   214  		return fmt.Errorf("have already seen QC for view %d, but newest QC is reported to be for view %d", r.safetyData.LockedOneChainView, newestQC.View)
   215  	}
   216  	if curView+1 <= r.safetyData.HighestAcknowledgedView {
   217  		return fmt.Errorf("cannot generate timeout for past view %d", curView)
   218  	}
   219  	// the logic for rejecting inputs with `curView <= newestQC.View` is already contained
   220  	// in `validateEvidenceForEnteringView(..)`, because it only passes if
   221  	// * either `curView == newestQC.View + 1` (condition 2)
   222  	// * or `curView > newestQC.View` (condition 4)
   223  
   224  	return nil
   225  }
   226  
   227  // validateEvidenceForEnteringView performs the following check that is fundamental for consensus safety:
   228  // Whenever a replica acts within a view, it must prove that is has sufficient evidence to enter this view
   229  // Specifically:
   230  //  1. The replica must always provide a QC and optionally a TC.
   231  //  2. [Happy Path] If the previous round (i.e. `view -1`) resulted in a QC, the replica is allowed to transition to `view`.
   232  //     The QC from the previous round provides sufficient evidence. Furthermore, to prevent resource-exhaustion attacks,
   233  //     we require that no TC is included as part of the proof.
   234  //  3. Following the Happy Path has priority over following the Recovery Path (specified below).
   235  //  4. [Recovery Path] If the previous round (i.e. `view -1`) did *not* result in a QC, a TC from the previous round
   236  //     is required to transition to `view`. The following additional consistency requirements have to be satisfied:
   237  //     (a) newestQC.View + 1 < view
   238  //     Otherwise, the replica has violated condition 3 (in case newestQC.View + 1 = view); or the replica
   239  //     failed to apply condition 2 (in case newestQC.View + 1 > view).
   240  //     (b) newestQC.View ≥ lastViewTC.NewestQC.View
   241  //     Otherwise, the replica has violated condition 3.
   242  //
   243  // SafetyRules has the sole signing authority and enforces adherence to these conditions. In order to generate valid
   244  // consensus signatures, the replica must provide the respective evidence (required QC + optional TC) to its
   245  // internal SafetyRules component for each consensus action that the replica wants to take:
   246  //   - primary signing its own proposal
   247  //   - replica voting for a block
   248  //   - replica generating a timeout message
   249  //
   250  // During normal operations, no errors are expected:
   251  //   - As we are expecting the blocks to be pre-validated, any failure here is a symptom of an internal bug.
   252  //   - When generating a timeout, the inputs are provided by node-internal components. Failure to comply with
   253  //     the protocol is a symptom of an internal bug.
   254  func (r *SafetyRules) validateEvidenceForEnteringView(view uint64, newestQC *flow.QuorumCertificate, lastViewTC *flow.TimeoutCertificate) error {
   255  	// Condition 1:
   256  	if newestQC == nil {
   257  		return fmt.Errorf("missing the mandatory QC")
   258  	}
   259  
   260  	// Condition 2:
   261  	if newestQC.View+1 == view {
   262  		if lastViewTC != nil {
   263  			return fmt.Errorf("when QC is for prior round, no TC should be provided")
   264  		}
   265  		return nil
   266  	}
   267  	// Condition 3: if we reach the following lines, the happy path is not satisfied.
   268  
   269  	// Condition 4:
   270  	if lastViewTC == nil {
   271  		return fmt.Errorf("expecting TC because QC is not for prior view; but didn't get any TC")
   272  	}
   273  	if lastViewTC.View+1 != view {
   274  		return fmt.Errorf("neither QC (view %d) nor TC (view %d) allows to transition to view %d", newestQC.View, lastViewTC.View, view)
   275  	}
   276  	if newestQC.View >= view {
   277  		// Note: we need to enforce here that `newestQC.View + 1 < view`, i.e. we error for `newestQC.View+1 >= view`
   278  		// However, `newestQC.View+1 == view` is impossible, because otherwise we would have walked into condition 2.
   279  		// Hence, it suffices to error if `newestQC.View+1 > view`, which is identical to `newestQC.View >= view`
   280  		return fmt.Errorf("still at view %d, despite knowing a QC for view %d", view, newestQC.View)
   281  	}
   282  	if newestQC.View < lastViewTC.NewestQC.View {
   283  		return fmt.Errorf("failed to update newest QC (still at view %d) despite a newer QC (view %d) being included in TC", newestQC.View, lastViewTC.NewestQC.View)
   284  	}
   285  
   286  	return nil
   287  }