github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/common/follower/compliance_core.go (about) 1 package follower 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 8 "github.com/rs/zerolog" 9 10 "github.com/onflow/flow-go/consensus/hotstuff" 11 "github.com/onflow/flow-go/consensus/hotstuff/model" 12 "github.com/onflow/flow-go/engine/common/follower/cache" 13 "github.com/onflow/flow-go/engine/common/follower/pending_tree" 14 "github.com/onflow/flow-go/model/flow" 15 "github.com/onflow/flow-go/module" 16 "github.com/onflow/flow-go/module/component" 17 "github.com/onflow/flow-go/module/irrecoverable" 18 "github.com/onflow/flow-go/module/trace" 19 "github.com/onflow/flow-go/state/protocol" 20 ) 21 22 // CertifiedBlocks is a connected list of certified blocks, in ascending height order. 23 type CertifiedBlocks []flow.CertifiedBlock 24 25 // defaultCertifiedRangeChannelCapacity maximum capacity of buffered channel that is used to transfer ranges of 26 // certified blocks to specific worker. 27 // Channel buffers ranges which consist of multiple blocks, so the real capacity of channel is larger 28 const defaultCertifiedRangeChannelCapacity = 20 29 30 // defaultFinalizedBlocksChannelCapacity maximum capacity of buffered channel that is used to transfer 31 // finalized blocks to specific worker. 32 const defaultFinalizedBlocksChannelCapacity = 10 33 34 // defaultPendingBlocksCacheCapacity maximum capacity of cache for pending blocks. 35 const defaultPendingBlocksCacheCapacity = 1000 36 37 // ComplianceCore implements main processing logic for follower engine. 38 // Generally is NOT concurrency safe but some functions can be used in concurrent setup. 39 type ComplianceCore struct { 40 *component.ComponentManager 41 log zerolog.Logger 42 mempoolMetrics module.MempoolMetrics 43 tracer module.Tracer 44 proposalViolationNotifier hotstuff.ProposalViolationConsumer 45 pendingCache *cache.Cache 46 pendingTree *pending_tree.PendingTree 47 state protocol.FollowerState 48 follower module.HotStuffFollower 49 validator hotstuff.Validator 50 sync module.BlockRequester 51 certifiedRangesChan chan CertifiedBlocks // delivers ranges of certified blocks to main core worker 52 finalizedBlocksChan chan *flow.Header // delivers finalized blocks to main core worker. 53 } 54 55 var _ complianceCore = (*ComplianceCore)(nil) 56 57 // NewComplianceCore creates new instance of ComplianceCore. 58 // No errors expected during normal operations. 59 func NewComplianceCore(log zerolog.Logger, 60 mempoolMetrics module.MempoolMetrics, 61 heroCacheCollector module.HeroCacheMetrics, 62 followerConsumer hotstuff.FollowerConsumer, 63 state protocol.FollowerState, 64 follower module.HotStuffFollower, 65 validator hotstuff.Validator, 66 sync module.BlockRequester, 67 tracer module.Tracer, 68 ) (*ComplianceCore, error) { 69 finalizedBlock, err := state.Final().Head() 70 if err != nil { 71 return nil, fmt.Errorf("could not query finalized block: %w", err) 72 } 73 74 c := &ComplianceCore{ 75 log: log.With().Str("engine", "follower_core").Logger(), 76 mempoolMetrics: mempoolMetrics, 77 state: state, 78 proposalViolationNotifier: followerConsumer, 79 pendingCache: cache.NewCache(log, defaultPendingBlocksCacheCapacity, heroCacheCollector, followerConsumer), 80 pendingTree: pending_tree.NewPendingTree(finalizedBlock), 81 follower: follower, 82 validator: validator, 83 sync: sync, 84 tracer: tracer, 85 certifiedRangesChan: make(chan CertifiedBlocks, defaultCertifiedRangeChannelCapacity), 86 finalizedBlocksChan: make(chan *flow.Header, defaultFinalizedBlocksChannelCapacity), 87 } 88 89 // prune cache to latest finalized view 90 c.pendingCache.PruneUpToView(finalizedBlock.View) 91 92 c.ComponentManager = component.NewComponentManagerBuilder(). 93 AddWorker(c.processCoreSeqEvents). 94 Build() 95 96 return c, nil 97 } 98 99 // OnBlockRange processes a range of connected blocks. It validates the incoming batch, adds it to cache of pending 100 // blocks and schedules certified blocks for further processing. The input list must be sequentially ordered forming 101 // a chain, i.e. connectedRange[i] is the parent of connectedRange[i+1]. Submitting a disconnected batch results in 102 // an `ErrDisconnectedBatch` error and the batch is dropped (no-op). 103 // This method is safe to use in concurrent environment. 104 // Caution: method might block if internally too many certified blocks are queued in the channel `certifiedRangesChan`. 105 // Expected errors during normal operations: 106 // - cache.ErrDisconnectedBatch 107 func (c *ComplianceCore) OnBlockRange(originID flow.Identifier, batch []*flow.Block) error { 108 if len(batch) < 1 { 109 return nil 110 } 111 112 firstBlock := batch[0].Header 113 lastBlock := batch[len(batch)-1].Header 114 hotstuffProposal := model.ProposalFromFlow(lastBlock) 115 log := c.log.With(). 116 Hex("origin_id", originID[:]). 117 Str("chain_id", lastBlock.ChainID.String()). 118 Uint64("first_block_height", firstBlock.Height). 119 Uint64("first_block_view", firstBlock.View). 120 Uint64("last_block_height", lastBlock.Height). 121 Uint64("last_block_view", lastBlock.View). 122 Hex("last_block_id", hotstuffProposal.Block.BlockID[:]). 123 Int("range_length", len(batch)). 124 Logger() 125 126 log.Info().Msg("processing block range") 127 128 if c.pendingCache.Peek(hotstuffProposal.Block.BlockID) == nil { 129 log.Debug().Msg("block not found in cache, performing validation") 130 // Caution: we are _not_ checking the proposal's full validity here. Instead, we need to check 131 // the following two critical properties: 132 // 1. The block has been signed by the legitimate primary for the view. This is important in case 133 // there are multiple blocks for the view. We need to differentiate the following byzantine cases: 134 // (i) Some other consensus node that is _not_ primary is trying to publish a block. 135 // This would result in the validation below failing with an `InvalidProposalError`. 136 // (ii) The legitimate primary for the view is equivocating. In this case, the validity check 137 // below would pass. Though, the `PendingTree` would eventually notice this, when we connect 138 // the equivocating blocks to the latest finalized block. 139 // 2. The QC within the block is valid. A valid QC proves validity of all ancestors. 140 err := c.validator.ValidateProposal(hotstuffProposal) 141 if err != nil { 142 if invalidBlockError, ok := model.AsInvalidProposalError(err); ok { 143 c.proposalViolationNotifier.OnInvalidBlockDetected(flow.Slashable[model.InvalidProposalError]{ 144 OriginID: originID, 145 Message: *invalidBlockError, 146 }) 147 return nil 148 } 149 if errors.Is(err, model.ErrViewForUnknownEpoch) { 150 // We have received a proposal, but we don't know the epoch its view is within. 151 // Conceptually, there are three scenarios that could lead to this edge-case: 152 // 1. the proposer maliciously created the block for a view very far in the future (it's invalid) 153 // -> in this case we can disregard the block 154 // 2. This node is very far behind and hasn't processed enough blocks to observe the EpochCommit 155 // service event. 156 // -> in this case we can disregard the block 157 // Note: we could eliminate this edge case by dropping future blocks, iff their _view_ 158 // is strictly larger than `V + EpochCommitSafetyThreshold`, where `V` denotes 159 // the latest finalized block known to this node. 160 // 3. No blocks have been finalized for the last `EpochCommitSafetyThreshold` views. This breaks 161 // a critical liveness assumption - see EpochCommitSafetyThreshold in protocol.Params for details. 162 // -> In this case, it is ok for the protocol to halt. Consequently, we can just disregard 163 // the block, which will probably lead to this node eventually halting. 164 log.Err(err).Msg("unable to validate proposal with view from unknown epoch") 165 return nil 166 } 167 return fmt.Errorf("unexpected error validating proposal: %w", err) 168 } 169 } 170 171 certifiedBatch, certifyingQC, err := c.pendingCache.AddBlocks(batch) 172 if err != nil { 173 return fmt.Errorf("could not add a range of pending blocks: %w", err) // ErrDisconnectedBatch or exception 174 } 175 log.Debug().Msgf("caching block range resulted in %d certified blocks (possibly including additional cached blocks)", len(certifiedBatch)) 176 177 if len(certifiedBatch) < 1 { 178 return nil 179 } 180 certifiedRange, err := rangeToCertifiedBlocks(certifiedBatch, certifyingQC) 181 if err != nil { 182 return fmt.Errorf("converting the certified batch to list of certified blocks failed: %w", err) 183 } 184 185 // in case we have already stopped our worker, we use a select statement to avoid 186 // blocking since there is no active consumer for this channel 187 select { 188 case c.certifiedRangesChan <- certifiedRange: 189 case <-c.ComponentManager.ShutdownSignal(): 190 } 191 return nil 192 } 193 194 // processCoreSeqEvents processes events that need to be dispatched on dedicated core's goroutine. 195 // Here we process events that need to be sequentially ordered(processing certified blocks and new finalized blocks). 196 // Implements `component.ComponentWorker` signature. 197 // Is NOT concurrency safe: should be executed by _single dedicated_ goroutine. 198 func (c *ComplianceCore) processCoreSeqEvents(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 199 ready() 200 201 doneSignal := ctx.Done() 202 for { 203 select { 204 case <-doneSignal: 205 return 206 case finalized := <-c.finalizedBlocksChan: 207 err := c.processFinalizedBlock(ctx, finalized) // no errors expected during normal operations 208 if err != nil { 209 ctx.Throw(err) 210 } 211 case blocks := <-c.certifiedRangesChan: 212 err := c.processCertifiedBlocks(ctx, blocks) // no errors expected during normal operations 213 if err != nil { 214 ctx.Throw(err) 215 } 216 } 217 } 218 } 219 220 // OnFinalizedBlock updates local state of pendingCache tree using received finalized block and queues finalized block 221 // to be processed by internal goroutine. 222 // This function is safe to use in concurrent environment. 223 // CAUTION: this function blocks and hence is not compliant with the `FinalizationConsumer.OnFinalizedBlock` interface. 224 func (c *ComplianceCore) OnFinalizedBlock(final *flow.Header) { 225 c.pendingCache.PruneUpToView(final.View) 226 227 // in-case we have already stopped our worker we use a select statement to avoid 228 // blocking since there is no active consumer for this channel 229 select { 230 case c.finalizedBlocksChan <- final: 231 case <-c.ComponentManager.ShutdownSignal(): 232 } 233 } 234 235 // processCertifiedBlocks processes the batch of certified blocks: 236 // 1. We add the certified blocks to the PendingTree. This might causes the pending PendingTree to detect 237 // additional blocks as now being connected to the latest finalized block. Specifically, the PendingTree 238 // returns the list `connectedBlocks`, which contains the subset of `blocks` that are connect to the 239 // finalized block plus all of their connected descendants. The list `connectedBlocks` is in 'parent first' 240 // order, i.e. a block is listed before any of its descendants. The PendingTree guarantees that all 241 // ancestors are listed, _unless_ the ancestor is the finalized block or the ancestor has been returned 242 // by a previous call to `PendingTree.AddBlocks`. 243 // 2. We extend the protocol state with the connected certified blocks from step 1. 244 // 3. We submit the connected certified blocks from step 1 to the consensus follower. 245 // 246 // Is NOT concurrency safe: should be executed by _single dedicated_ goroutine. 247 // No errors expected during normal operations. 248 func (c *ComplianceCore) processCertifiedBlocks(ctx context.Context, blocks CertifiedBlocks) error { 249 span, ctx := c.tracer.StartSpanFromContext(ctx, trace.FollowerProcessCertifiedBlocks) 250 defer span.End() 251 252 // Step 1: add blocks to our PendingTree of certified blocks 253 pendingTreeSpan, _ := c.tracer.StartSpanFromContext(ctx, trace.FollowerExtendPendingTree) 254 connectedBlocks, err := c.pendingTree.AddBlocks(blocks) 255 pendingTreeSpan.End() 256 if err != nil { 257 return fmt.Errorf("could not process batch of certified blocks: %w", err) 258 } 259 260 // Step 2 & 3: extend protocol state with connected certified blocks and forward them to consensus follower 261 for _, certifiedBlock := range connectedBlocks { 262 s, _ := c.tracer.StartBlockSpan(ctx, certifiedBlock.ID(), trace.FollowerExtendProtocolState) 263 err = c.state.ExtendCertified(ctx, certifiedBlock.Block, certifiedBlock.CertifyingQC) 264 s.End() 265 if err != nil { 266 return fmt.Errorf("could not extend protocol state with certified block: %w", err) 267 } 268 269 b, err := model.NewCertifiedBlock(model.BlockFromFlow(certifiedBlock.Block.Header), certifiedBlock.CertifyingQC) 270 if err != nil { 271 return fmt.Errorf("failed to convert certified block %v to HotStuff type: %w", certifiedBlock.Block.ID(), err) 272 } 273 c.follower.AddCertifiedBlock(&b) // submit the model to follower for async processing 274 } 275 return nil 276 } 277 278 // processFinalizedBlock informs the PendingTree about finalization of the given block. 279 // Is NOT concurrency safe: should be executed by _single dedicated_ goroutine. 280 // No errors expected during normal operations. 281 func (c *ComplianceCore) processFinalizedBlock(ctx context.Context, finalized *flow.Header) error { 282 span, _ := c.tracer.StartSpanFromContext(ctx, trace.FollowerProcessFinalizedBlock) 283 defer span.End() 284 285 connectedBlocks, err := c.pendingTree.FinalizeFork(finalized) 286 if err != nil { 287 return fmt.Errorf("could not process finalized fork at view %d: %w", finalized.View, err) 288 } 289 // The pending tree allows to skip ahead, which makes the algorithm more general and simplifies its implementation. 290 // However, here we are implementing the consensus follower, which cannot skip ahead. This is because the consensus 291 // follower locally determines finality and therefore must ingest every block. In other words: ever block that is 292 // later finalized must have been connected before. Otherwise, the block would never have been forwarded to the 293 // HotStuff follower and no finalization notification would have been triggered. 294 // Therefore, from the perspective of the consensus follower, receiving a _non-empty_ `connectedBlocks` is a 295 // symptom of internal state corruption or a bug. 296 if len(connectedBlocks) > 0 { 297 return fmt.Errorf("finalizing block %v caused the PendingTree to connect additional blocks, which is a symptom of internal state corruption or a bug", finalized.ID()) 298 } 299 return nil 300 } 301 302 // rangeToCertifiedBlocks transform batch of connected blocks and a QC that certifies last block to a range of 303 // certified and connected blocks. 304 // Pure function (side-effect free). No errors expected during normal operations. 305 func rangeToCertifiedBlocks(certifiedRange []*flow.Block, certifyingQC *flow.QuorumCertificate) (CertifiedBlocks, error) { 306 certifiedBlocks := make(CertifiedBlocks, 0, len(certifiedRange)) 307 lastIndex := len(certifiedRange) - 1 308 for i, block := range certifiedRange { 309 var qc *flow.QuorumCertificate 310 if i < lastIndex { 311 qc = certifiedRange[i+1].Header.QuorumCertificate() 312 } else { 313 qc = certifyingQC 314 } 315 316 // bundle block and its certifying QC to `CertifiedBlock`: 317 certBlock, err := flow.NewCertifiedBlock(block, qc) 318 if err != nil { 319 return nil, fmt.Errorf("constructing certified root block failed: %w", err) 320 } 321 certifiedBlocks = append(certifiedBlocks, certBlock) 322 } 323 return certifiedBlocks, nil 324 }