github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/timeoutcollector/timeout_processor.go (about)

     1  package timeoutcollector
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  
     7  	"github.com/rs/zerolog"
     8  	"go.uber.org/atomic"
     9  	"golang.org/x/exp/slices"
    10  
    11  	"github.com/onflow/flow-go/consensus/hotstuff"
    12  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    13  	"github.com/onflow/flow-go/consensus/hotstuff/tracker"
    14  	"github.com/onflow/flow-go/model/flow"
    15  	"github.com/onflow/flow-go/module/signature"
    16  )
    17  
// accumulatedWeightTracker tracks the one-time event of reaching a required weight.
// It uses an atomic flag to guarantee concurrency safety.
type accumulatedWeightTracker struct {
	minRequiredWeight uint64      // threshold that the weight passed to Track is compared against
	done              atomic.Bool // flipped from false to true by the first Track call meeting the threshold
}
    24  
    25  func (t *accumulatedWeightTracker) Done() bool {
    26  	return t.done.Load()
    27  }
    28  
    29  // Track returns true if `weight` reaches or exceeds `minRequiredWeight` for the _first time_.
    30  // All subsequent calls of `Track` (with any value) return false.
    31  func (t *accumulatedWeightTracker) Track(weight uint64) bool {
    32  	if weight < t.minRequiredWeight {
    33  		return false
    34  	}
    35  	return t.done.CompareAndSwap(false, true)
    36  }
    37  
// TimeoutProcessor implements the hotstuff.TimeoutProcessor interface.
// It processes timeout objects (TOs) broadcast by other replicas of the consensus committee.
// A TimeoutProcessor collects TOs for a single view. Once enough timeout objects have been
// contributed, it creates a timeout certificate (TC), which can be used to advance the round.
// Concurrency safe.
//
// Fields:
//   - log: logger used for debug output while processing timeouts
//   - view: the view this processor collects timeouts for; Process rejects timeouts for any other view
//   - validator: validates the QC and (optional) TC embedded in each timeout
//   - committee: source of the weight thresholds and of the identities used to encode signer indices
//   - sigAggregator: verifies and accumulates timeout signatures, and aggregates them when the TC is built
//   - notifier: receives the partial-TC and TC-constructed callbacks
//   - partialTCTracker: one-time gate for emitting the partial-TC notification
//   - tcTracker: one-time gate for building the TC
//   - newestQCTracker: tracks the newest QC among the processed timeouts
type TimeoutProcessor struct {
	log              zerolog.Logger
	view             uint64
	validator        hotstuff.Validator
	committee        hotstuff.Replicas
	sigAggregator    hotstuff.TimeoutSignatureAggregator
	notifier         hotstuff.TimeoutCollectorConsumer
	partialTCTracker accumulatedWeightTracker
	tcTracker        accumulatedWeightTracker
	newestQCTracker  *tracker.NewestQCTracker
}

// Compile-time check that *TimeoutProcessor satisfies hotstuff.TimeoutProcessor.
var _ hotstuff.TimeoutProcessor = (*TimeoutProcessor)(nil)
    56  
    57  // NewTimeoutProcessor creates new instance of TimeoutProcessor
    58  // Returns the following expected errors for invalid inputs:
    59  //   - model.ErrViewForUnknownEpoch if no epoch containing the given view is known
    60  //
    61  // All other errors should be treated as exceptions.
    62  func NewTimeoutProcessor(log zerolog.Logger,
    63  	committee hotstuff.Replicas,
    64  	validator hotstuff.Validator,
    65  	sigAggregator hotstuff.TimeoutSignatureAggregator,
    66  	notifier hotstuff.TimeoutCollectorConsumer,
    67  ) (*TimeoutProcessor, error) {
    68  	view := sigAggregator.View()
    69  	qcThreshold, err := committee.QuorumThresholdForView(view)
    70  	if err != nil {
    71  		return nil, fmt.Errorf("could not retrieve QC weight threshold for view %d: %w", view, err)
    72  	}
    73  	timeoutThreshold, err := committee.TimeoutThresholdForView(view)
    74  	if err != nil {
    75  		return nil, fmt.Errorf("could not retrieve timeout weight threshold for view %d: %w", view, err)
    76  	}
    77  	return &TimeoutProcessor{
    78  		log: log.With().
    79  			Str("component", "hotstuff.timeout_processor").
    80  			Uint64("view", view).
    81  			Logger(),
    82  		view:      view,
    83  		committee: committee,
    84  		validator: validator,
    85  		notifier:  notifier,
    86  		partialTCTracker: accumulatedWeightTracker{
    87  			minRequiredWeight: timeoutThreshold,
    88  			done:              *atomic.NewBool(false),
    89  		},
    90  		tcTracker: accumulatedWeightTracker{
    91  			minRequiredWeight: qcThreshold,
    92  			done:              *atomic.NewBool(false),
    93  		},
    94  		sigAggregator:   sigAggregator,
    95  		newestQCTracker: tracker.NewNewestQCTracker(),
    96  	}, nil
    97  }
    98  
    99  // Process performs processing of timeout object in concurrent safe way. This
   100  // function is implemented to be called by multiple goroutines at the same time.
   101  // Design of this function is event driven, as soon as we collect enough weight
   102  // to create a TC or a partial TC we will immediately do so and submit it
   103  // via callback for further processing.
   104  // Expected error returns during normal operations:
   105  //   - ErrTimeoutForIncompatibleView - submitted timeout for incompatible view
   106  //   - model.InvalidTimeoutError - submitted invalid timeout(invalid structure or invalid signature)
   107  //   - model.DuplicatedSignerError if a timeout from the same signer was previously already added
   108  //     It does _not necessarily_ imply that the timeout is invalid or the sender is equivocating.
   109  //
   110  // All other errors should be treated as exceptions.
   111  func (p *TimeoutProcessor) Process(timeout *model.TimeoutObject) error {
   112  	if p.view != timeout.View {
   113  		return fmt.Errorf("received incompatible timeout, expected %d got %d: %w", p.view, timeout.View, ErrTimeoutForIncompatibleView)
   114  	}
   115  
   116  	if p.tcTracker.Done() {
   117  		return nil
   118  	}
   119  
   120  	err := p.validateTimeout(timeout)
   121  	if err != nil {
   122  		return fmt.Errorf("validating timeout failed: %w", err)
   123  	}
   124  	if p.tcTracker.Done() {
   125  		return nil
   126  	}
   127  
   128  	// CAUTION: for correctness it is critical that we update the `newestQCTracker` first, _before_ we add the
   129  	// TO's signature to `sigAggregator`. Reasoning:
   130  	//  * For a valid TC, we require that the TC includes a QC with view ≥ max{TC.NewestQCViews}.
   131  	//  * The `NewestQCViews` is maintained by `sigAggregator`.
   132  	//  * Hence, for any view `v ∈ NewestQCViews` that `sigAggregator` knows, a QC with equal or larger view is
   133  	//    known to `newestQCTracker`. This is guaranteed if and only if `newestQCTracker` is updated first.
   134  	p.newestQCTracker.Track(timeout.NewestQC)
   135  
   136  	totalWeight, err := p.sigAggregator.VerifyAndAdd(timeout.SignerID, timeout.SigData, timeout.NewestQC.View)
   137  	if err != nil {
   138  		if model.IsInvalidSignerError(err) {
   139  			return model.NewInvalidTimeoutErrorf(timeout, "invalid signer for timeout: %w", err)
   140  		}
   141  		if errors.Is(err, model.ErrInvalidSignature) {
   142  			return model.NewInvalidTimeoutErrorf(timeout, "timeout is from valid signer but has cryptographically invalid signature: %w", err)
   143  		}
   144  		// model.DuplicatedSignerError is an expected error and just bubbled up the call stack.
   145  		// It does _not necessarily_ imply that the timeout is invalid or the sender is equivocating.
   146  		return fmt.Errorf("adding signature to aggregator failed: %w", err)
   147  	}
   148  	p.log.Debug().Msgf("processed timeout, total weight=(%d), required=(%d)", totalWeight, p.tcTracker.minRequiredWeight)
   149  
   150  	if p.partialTCTracker.Track(totalWeight) {
   151  		p.notifier.OnPartialTcCreated(p.view, p.newestQCTracker.NewestQC(), timeout.LastViewTC)
   152  	}
   153  
   154  	// Checking of conditions for building TC are satisfied when willBuildTC is true.
   155  	// At this point, we have enough signatures to build a TC. Another routine
   156  	// might just be at this point. To avoid duplicate work, Track returns true only once.
   157  	willBuildTC := p.tcTracker.Track(totalWeight)
   158  	if !willBuildTC {
   159  		// either we do not have enough timeouts to build a TC, or another thread
   160  		// has already passed this gate and created a TC
   161  		return nil
   162  	}
   163  
   164  	tc, err := p.buildTC()
   165  	if err != nil {
   166  		return fmt.Errorf("internal error constructing TC: %w", err)
   167  	}
   168  	p.notifier.OnTcConstructedFromTimeouts(tc)
   169  
   170  	return nil
   171  }
   172  
   173  // validateTimeout performs validation of timeout object, verifies if timeout is correctly structured
   174  // and included QC and TC is correctly structured and signed.
   175  // ATTENTION: this function does _not_ check whether the TO's `SignerID` is an authorized node nor if
   176  // the signature is valid. These checks happen in signature aggregator.
   177  // Expected error returns during normal operations:
   178  // * model.InvalidTimeoutError - submitted invalid timeout
   179  // All other errors should be treated as exceptions.
   180  func (p *TimeoutProcessor) validateTimeout(timeout *model.TimeoutObject) error {
   181  	// 1. check if it's correctly structured
   182  	// (a) Every TO must contain a QC
   183  	if timeout.NewestQC == nil {
   184  		return model.NewInvalidTimeoutErrorf(timeout, "TimeoutObject without QC is invalid")
   185  	}
   186  
   187  	if timeout.View <= timeout.NewestQC.View {
   188  		return model.NewInvalidTimeoutErrorf(timeout, "TO's QC %d cannot be newer than the TO's view %d",
   189  			timeout.NewestQC.View, timeout.View)
   190  	}
   191  
   192  	// (b) If a TC is included, the TC must be for the past round, no matter whether a QC
   193  	//     for the last round is also included. In some edge cases, a node might observe
   194  	//     _both_ QC and TC for the previous round, in which case it can include both.
   195  	if timeout.LastViewTC != nil {
   196  		if timeout.View != timeout.LastViewTC.View+1 {
   197  			return model.NewInvalidTimeoutErrorf(timeout, "invalid TC for non-previous view, expected view %d, got view %d", timeout.View-1, timeout.LastViewTC.View)
   198  		}
   199  		if timeout.NewestQC.View < timeout.LastViewTC.NewestQC.View {
   200  			return model.NewInvalidTimeoutErrorf(timeout, "timeout.NewestQC is older (view=%d) than the QC in timeout.LastViewTC (view=%d)", timeout.NewestQC.View, timeout.LastViewTC.NewestQC.View)
   201  		}
   202  	}
   203  	// (c) The TO must contain a proof that sender legitimately entered timeout.View. Transitioning
   204  	//     to round timeout.View is possible either by observing a QC or a TC for the previous round.
   205  	//     If no QC is included, we require a TC to be present, which by check (1b) must be for
   206  	//     the previous round.
   207  	lastViewSuccessful := timeout.View == timeout.NewestQC.View+1
   208  	if !lastViewSuccessful {
   209  		// The TO's sender did _not_ observe a QC for round timeout.View-1. Hence, it should
   210  		// include a TC for the previous round. Otherwise, the TO is invalid.
   211  		if timeout.LastViewTC == nil {
   212  			return model.NewInvalidTimeoutErrorf(timeout, "timeout must include TC")
   213  		}
   214  	}
   215  
   216  	// 2. Check if QC is valid
   217  	err := p.validator.ValidateQC(timeout.NewestQC)
   218  	if err != nil {
   219  		if model.IsInvalidQCError(err) {
   220  			return model.NewInvalidTimeoutErrorf(timeout, "included QC is invalid: %w", err)
   221  		}
   222  		if errors.Is(err, model.ErrViewForUnknownEpoch) {
   223  			// We require each replica to be bootstrapped with a QC pointing to a finalized block. Therefore, we should know the
   224  			// Epoch for any QC.View and TC.View we encounter. Receiving a `model.ErrViewForUnknownEpoch` is conceptually impossible,
   225  			// i.e. a symptom of an internal bug or invalid bootstrapping information.
   226  			return fmt.Errorf("no Epoch information availalbe for QC that was included in TO; symptom of internal bug or invalid bootstrapping information: %s", err.Error())
   227  		}
   228  		return fmt.Errorf("unexpected error when validating QC: %w", err)
   229  	}
   230  
   231  	// 3. If TC is included, it must be valid
   232  	if timeout.LastViewTC != nil {
   233  		err = p.validator.ValidateTC(timeout.LastViewTC)
   234  		if err != nil {
   235  			if model.IsInvalidTCError(err) {
   236  				return model.NewInvalidTimeoutErrorf(timeout, "included TC is invalid: %w", err)
   237  			}
   238  			if errors.Is(err, model.ErrViewForUnknownEpoch) {
   239  				// We require each replica to be bootstrapped with a QC pointing to a finalized block. Therefore, we should know the
   240  				// Epoch for any QC.View and TC.View we encounter. Receiving a `model.ErrViewForUnknownEpoch` is conceptually impossible,
   241  				// i.e. a symptom of an internal bug or invalid bootstrapping information.
   242  				return fmt.Errorf("no Epoch information availalbe for TC that was included in TO; symptom of internal bug or invalid bootstrapping information: %s", err.Error())
   243  			}
   244  			return fmt.Errorf("unexpected error when validating TC: %w", err)
   245  		}
   246  	}
   247  	return nil
   248  
   249  }
   250  
   251  // buildTC performs aggregation of signatures when we have collected enough
   252  // weight for building TC. This function is run only once by single worker.
   253  // Any error should be treated as exception.
   254  func (p *TimeoutProcessor) buildTC() (*flow.TimeoutCertificate, error) {
   255  	signersData, aggregatedSig, err := p.sigAggregator.Aggregate()
   256  	if err != nil {
   257  		return nil, fmt.Errorf("could not aggregate multi message signature: %w", err)
   258  	}
   259  
   260  	// IMPORTANT: To properly verify an aggregated signature included in TC we need to provide list of signers with corresponding
   261  	// messages(`TimeoutCertificate.NewestQCViews`) for each signer. If the one-to-once correspondence of view and signer is not maintained,
   262  	// it won't be possible to verify the aggregated signature.
   263  	// Aggregate returns an unordered set of signers together with additional data.
   264  	// Due to implementation specifics of signer indices, the decoding step results in canonically ordered signer ids, which means
   265  	// we need to canonically order the respective `newestQCView`, so we can properly map signer to `newestQCView` after decoding.
   266  
   267  	// sort data in canonical order
   268  	slices.SortFunc(signersData, func(lhs, rhs hotstuff.TimeoutSignerInfo) int {
   269  		return flow.IdentifierCanonical(lhs.Signer, rhs.Signer)
   270  	})
   271  
   272  	// extract signers and data separately
   273  	signers := make([]flow.Identifier, 0, len(signersData))
   274  	newestQCViews := make([]uint64, 0, len(signersData))
   275  	for _, data := range signersData {
   276  		signers = append(signers, data.Signer)
   277  		newestQCViews = append(newestQCViews, data.NewestQCView)
   278  	}
   279  
   280  	signerIndices, err := p.signerIndicesFromIdentities(signers)
   281  	if err != nil {
   282  		return nil, fmt.Errorf("could not encode signer indices: %w", err)
   283  	}
   284  
   285  	// Note that `newestQC` can have a larger view than any of the views included in `newestQCViews`.
   286  	// This is because for a TO currently being processes following two operations are executed in separate steps:
   287  	// * updating the `newestQCTracker` with the QC from the TO
   288  	// * adding the TO's signature to `sigAggregator`
   289  	// Therefore, races are possible, where the `newestQCTracker` already knows of a QC with larger view
   290  	// than the data stored in `sigAggregator`.
   291  	newestQC := p.newestQCTracker.NewestQC()
   292  
   293  	return &flow.TimeoutCertificate{
   294  		View:          p.view,
   295  		NewestQCViews: newestQCViews,
   296  		NewestQC:      newestQC,
   297  		SignerIndices: signerIndices,
   298  		SigData:       aggregatedSig,
   299  	}, nil
   300  }
   301  
   302  // signerIndicesFromIdentities encodes identities into signer indices.
   303  // Any error should be treated as exception.
   304  func (p *TimeoutProcessor) signerIndicesFromIdentities(signerIDs flow.IdentifierList) ([]byte, error) {
   305  	allIdentities, err := p.committee.IdentitiesByEpoch(p.view)
   306  	if err != nil {
   307  		return nil, fmt.Errorf("could not retrieve identities for view %d: %w", p.view, err)
   308  	}
   309  	signerIndices, err := signature.EncodeSignersToIndices(allIdentities.NodeIDs(), signerIDs)
   310  	if err != nil {
   311  		return nil, fmt.Errorf("could not encode signer identifiers to indices: %w", err)
   312  	}
   313  	return signerIndices, nil
   314  }