github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/collection/test/cluster_switchover_test.go

package test

import (
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"

	"github.com/onflow/flow-go/cmd/bootstrap/run"
	"github.com/onflow/flow-go/engine/testutil"
	testmock "github.com/onflow/flow-go/engine/testutil/mock"
	model "github.com/onflow/flow-go/model/bootstrap"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/factory"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/util"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/network/mocknetwork"
	"github.com/onflow/flow-go/network/stub"
	"github.com/onflow/flow-go/state/cluster"
	bcluster "github.com/onflow/flow-go/state/cluster/badger"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/state/protocol/inmem"
	"github.com/onflow/flow-go/state/protocol/protocol_state/kvstore"
	protocol_state "github.com/onflow/flow-go/state/protocol/protocol_state/state"
	"github.com/onflow/flow-go/utils/unittest"
)

// ClusterSwitchoverTestCase comprises one test case of the cluster switchover.
// Collection nodes are assigned to one cluster each epoch. On epoch
// boundaries they must gracefully terminate cluster consensus for the ending
// epoch and begin cluster consensus for the new epoch. These two consensus
// committees co-exist for a short period at the beginning of each epoch.
type ClusterSwitchoverTestCase struct {
	t    *testing.T
	conf ClusterSwitchoverTestConf

	nodeInfos []model.NodeInfo          // identity table
	hub       *stub.Hub                 // mock network hub
	root      protocol.Snapshot         // shared root snapshot
	nodes     []testmock.CollectionNode // collection nodes
	sn        *mocknetwork.Engine       // fake consensus node engine for receiving guarantees
	builder   *unittest.EpochBuilder    // utility for building epochs

	// epoch counter -> cluster index -> transaction IDs
	sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions
}

// NewClusterSwitchoverTestCase constructs a new cluster switchover test case
// given the configuration, creating all dependencies and mock nodes.
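//
// At a high level, construction proceeds as follows (a summary of the code
// below): generate collector identities and a cluster assignment, build root
// cluster blocks and sign root cluster QCs for them, bootstrap a root
// protocol snapshot containing those QCs, create one mock collection node per
// collector plus a mock consensus node to receive guarantees, and wire up an
// EpochBuilder that re-signs cluster QCs for each newly built epoch.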
func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase {

	tc := &ClusterSwitchoverTestCase{
		t:    t,
		conf: conf,
	}
	tc.nodeInfos = unittest.PrivateNodeInfosFromIdentityList(
		unittest.CompleteIdentitySet(
			unittest.IdentityListFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection))...),
	)
	identities := model.ToIdentityList(tc.nodeInfos)
	collectors := identities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)).ToSkeleton()
	assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors)
	clusters, err := factory.NewClusterList(assignment, collectors)
	require.NoError(t, err)
	rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters)
	rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks))
	for i, cluster := range clusters {
		signers := make([]model.NodeInfo, 0)
		for _, identity := range tc.nodeInfos {
			if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster {
				signers = append(signers, identity)
			}
		}
		signerIdentities := model.ToIdentityList(signers).Sort(flow.Canonical[flow.Identity]).ToSkeleton()
		qc, err := run.GenerateClusterRootQC(signers, signerIdentities, rootClusterBlocks[i])
		require.NoError(t, err)
		rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{
			View:      qc.View,
			BlockID:   qc.BlockID,
			SignerIDs: signerIdentities.NodeIDs(),
			SigData:   qc.SigData,
		})
	}

	tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList)
	tc.hub = stub.NewNetworkHub()

	// create a root snapshot with the given number of initial clusters
	root, result, seal := unittest.BootstrapFixture(identities)
	qc := unittest.QuorumCertificateFixture(unittest.QCWithRootBlockID(root.ID()))
	setup := result.ServiceEvents[0].Event.(*flow.EpochSetup)
	commit := result.ServiceEvents[1].Event.(*flow.EpochCommit)

	setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, identities.ToSkeleton())
	commit.ClusterQCs = rootClusterQCs

	seal.ResultID = result.ID()
	root.Payload.ProtocolStateID = kvstore.NewDefaultKVStore(
		inmem.ProtocolStateFromEpochServiceEvents(setup, commit).ID()).ID()
	tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc)
	require.NoError(t, err)

	// build a lookup table for node infos
	nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
	for _, nodeInfo := range tc.nodeInfos {
		nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
	}

	// create a mock node for each collector identity
	for _, collector := range collectors {
		nodeInfo := nodeInfoLookup[collector.NodeID]
		node := testutil.CollectionNode(tc.T(), tc.hub, nodeInfo, tc.root)
		tc.nodes = append(tc.nodes, node)
	}

	// create a mock consensus node to receive collection guarantees
	consensus := testutil.GenericNode(
		tc.T(),
		tc.hub,
		nodeInfoLookup[identities.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0].NodeID],
		tc.root,
	)
	tc.sn = new(mocknetwork.Engine)
	_, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn)
	require.NoError(tc.T(), err)

	// create an epoch builder hooked to each collector's protocol state
	states := make([]protocol.FollowerState, 0)
	for _, node := range tc.nodes {
		states = append(states, node.State)
	}

	// take the first collection node and use its storage as the data source for the stateMutator
	refNode := tc.nodes[0]
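	// the mutable protocol state below evolves the protocol state as the
	// EpochBuilder extends the chain; using the first node's storage is an
	// arbitrary choice: any node should do, since every node is bootstrapped
	// from the same root snapshot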
	stateMutator := protocol_state.NewMutableProtocolState(
		refNode.EpochProtocolState,
		refNode.ProtocolKVStore,
		refNode.State.Params(),
		refNode.Headers,
		refNode.Results,
		refNode.Setups,
		refNode.EpochCommits,
	)

	// when building a new epoch we want to replace the fixture cluster QCs
	// with real ones; to do that, we generate them using the node infos
	tc.builder = unittest.NewEpochBuilder(tc.T(), stateMutator, states...).UsingCommitOpts(func(commit *flow.EpochCommit) {
		// build a lookup table for node infos
		nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
		for _, nodeInfo := range tc.nodeInfos {
			nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
		}

		// replace cluster QCs with real data
		for i, clusterQC := range commit.ClusterQCs {
			clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup()
			signers := make([]model.NodeInfo, 0, len(clusterParticipants))
			for _, signerID := range clusterQC.VoterIDs {
				signer := nodeInfoLookup[signerID]
				signers = append(signers, signer)
			}

			// generate root cluster block
			rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers).ToSkeleton())
			// generate cluster root qc
			qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers).ToSkeleton(), rootClusterBlock)
			require.NoError(t, err)
			signerIDs := toSignerIDs(signers)
			qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{
				View:      qc.View,
				BlockID:   qc.BlockID,
				SignerIDs: signerIDs,
				SigData:   qc.SigData,
			}
			commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs)
		}
	})

	return tc
}

func toSignerIDs(signers []model.NodeInfo) []flow.Identifier {
	signerIDs := make([]flow.Identifier, 0, len(signers))
	for _, signer := range signers {
		signerIDs = append(signerIDs, signer.NodeID)
	}
	return signerIDs
}

// TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster.
func TestClusterSwitchover_Simple(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 1,
	}))
}

// TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster
// containing more than one collector.
func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 2,
	}))
}

// TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters.
func TestClusterSwitchover_MultiCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   2,
		collectors: 2,
	}))
}

// ClusterSwitchoverTestConf configures a test case.
type ClusterSwitchoverTestConf struct {
	clusters   uint // # of clusters each epoch
	collectors uint // # of collectors each epoch
}

func (tc *ClusterSwitchoverTestCase) T() *testing.T {
	return tc.t
}

// StartNodes starts all collection nodes in the suite and turns on continuous
// delivery in the stub network.
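//
// With continuous delivery enabled, the stub network flushes queued messages
// on a short interval (10ms here) rather than requiring manual delivery
// calls, so the mock nodes keep exchanging messages in the background for the
// rest of the test.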
func (tc *ClusterSwitchoverTestCase) StartNodes() {

	// start all node components
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		node.Start(tc.T())
		nodes = append(nodes, node)
	}

	unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes")

	// start continuous delivery for all nodes
	for _, node := range tc.nodes {
		node.Net.StartConDev(10*time.Millisecond, false)
	}
}

func (tc *ClusterSwitchoverTestCase) StopNodes() {
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		nodes = append(nodes, node)
	}
	unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes")
}

func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header {
	head, err := tc.root.Head()
	require.NoError(tc.T(), err)
	return head
}

func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address {
	return tc.RootBlock().ChainID.Chain().ServiceAddress()
}

// Transaction returns a transaction which is valid for ingestion by a
// collection node in this test suite.
func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody {
	tx := flow.NewTransactionBody().
		AddAuthorizer(tc.ServiceAddress()).
		SetPayer(tc.ServiceAddress()).
		SetScript(unittest.NoopTxScript()).
		SetReferenceBlockID(tc.RootBlock().ID())

	for _, apply := range opts {
		apply(tx)
	}

	return tx
}

// ExpectTransaction asserts that the test case expects the given transaction
// to be included in the given cluster state for the given epoch.
func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) {
	if _, ok := tc.sentTransactions[epochCounter]; !ok {
		tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList)
	}
	tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex)
	expected := tc.sentTransactions[epochCounter][clusterIndex]
	expected = append(expected, txID)
	tc.sentTransactions[epochCounter][clusterIndex] = expected
}

// ClusterState opens and returns a read-only cluster state for the given node and cluster ID.
func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID, epoch uint64) cluster.State {
	state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID, epoch)
	require.NoError(tc.T(), err)
	return state
}

// State returns the protocol state.
func (tc *ClusterSwitchoverTestCase) State() protocol.State {
	return tc.nodes[0].State
}

// Collector returns the mock node for the collector with the given ID.
func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode {
	for _, node := range tc.nodes {
		if node.Me.NodeID() == id {
			return node
		}
	}
	tc.T().FailNow()
	return testmock.CollectionNode{}
}

// Clusters returns the clusters for the given epoch.
func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster {
	clustering, err := epoch.Clustering()
	require.NoError(tc.T(), err)

	clusters := make([]protocol.Cluster, 0, len(clustering))
	for i := uint(0); i < uint(len(clustering)); i++ {
		cluster, err := epoch.Cluster(i)
		require.NoError(tc.T(), err)
		clusters = append(clusters, cluster)
	}

	return clusters
}

// BlockInEpoch returns the highest block that exists within the bounds of the
// epoch with the given epoch counter.
func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header {
	root := tc.RootBlock()

	for height := root.Height; ; height++ {
		curr := tc.State().AtHeight(height)
		next := tc.State().AtHeight(height + 1)
		curCounter, err := curr.Epochs().Current().Counter()
		require.NoError(tc.T(), err)
		nextCounter, err := next.Epochs().Current().Counter()
		// if we reach a point where the next block doesn't exist, but the
		// current block has the correct counter, return the current block
		if err != nil && curCounter == epochCounter {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}

		// otherwise, wait until we reach the block where the next block is in
		// the next epoch - this is the highest block in the requested epoch
		if curCounter == epochCounter && nextCounter == epochCounter+1 {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}
	}
}

// SubmitTransactionToCluster submits a transaction to the given cluster in
// the given epoch and marks the transaction as expected for inclusion in
// the corresponding cluster state.
func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster(
	epochCounter uint64, // the epoch w.r.t. which we are submitting the transaction
	clustering flow.ClusterList, // the clustering for the epoch
	clusterIndex uint, // the index of the cluster we are targeting
) {

	clusterMembers := clustering[int(clusterIndex)]
	// get any block within the target epoch as the transaction's reference block
	refBlock := tc.BlockInEpoch(epochCounter)
	tx := tc.Transaction(func(tx *flow.TransactionBody) {
		tx.SetReferenceBlockID(refBlock.ID())
	})
	clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil)
	tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID())

	// submit the transaction to any collector in this cluster
	err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx)
	require.NoError(tc.T(), err)
}

// CheckClusterState checks the cluster state of the given node (within the given
// cluster) and asserts that only transactions specified by ExpectTransaction are
// included.
func (tc *ClusterSwitchoverTestCase) CheckClusterState(
	identity *flow.IdentitySkeleton,
	clusterInfo protocol.Cluster,
) {
	node := tc.Collector(identity.NodeID)
	state := tc.ClusterState(node, clusterInfo.ChainID(), clusterInfo.EpochCounter())
	expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()]
	unittest.NewClusterStateChecker(state).
		ExpectTxCount(len(expected)).
		ExpectContainsTx(expected...).
		Assert(tc.T())
}

// Timeout returns the timeout for async tasks for this test case.
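// For example, the two-collector cases above get 60s + 2*10s = 80s.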
func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration {
	// 60s + 10s for each collector
	// locally the whole suite takes
	// * ~8s when run alone
	// * ~15-20s when run in parallel with other packages (default)
	return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors)
}

// RunTestCase comprises the core test logic for cluster switchover. We build
// an epoch, which triggers the beginning of the epoch 2 cluster consensus, then
// send transactions targeting clusters from both epochs while both are running.
func RunTestCase(tc *ClusterSwitchoverTestCase) {

	tc.StartNodes()
	defer tc.StopNodes()

	// keep track of guarantees received at the mock consensus node
	// when a guarantee is received, it indicates that the sender has finalized
	// the corresponding cluster block
	expectedGuaranteesPerEpoch := int(tc.conf.collectors)
	waitForGuarantees := new(sync.WaitGroup)
	waitForGuarantees.Add(expectedGuaranteesPerEpoch)
	tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything).
		Return(nil).
		Run(func(args mock.Arguments) {
			id, ok := args[1].(flow.Identifier)
			require.True(tc.T(), ok)
			_, ok = args[2].(*flow.CollectionGuarantee)
			tc.T().Log("got guarantee from", id.String())
			require.True(tc.T(), ok)
			waitForGuarantees.Done()
		}).
		Times(expectedGuaranteesPerEpoch * 2)

	// build the epoch, ending on the first block of the next epoch
	tc.builder.BuildEpoch().CompleteEpoch()
	// build halfway through the grace period for the epoch 1 cluster
	tc.builder.AddBlocksWithSeals(flow.DefaultTransactionExpiry/2, 1)

	epoch1 := tc.State().Final().Epochs().Previous()
	epoch2 := tc.State().Final().Epochs().Current()

	epoch1Clusters := tc.Clusters(epoch1)
	epoch2Clusters := tc.Clusters(epoch2)
	epoch1Clustering, err := epoch1.Clustering()
	require.NoError(tc.T(), err)
	epoch2Clustering, err := epoch2.Clustering()
	require.NoError(tc.T(), err)

	// submit transactions targeting epoch 1 clusters
	for clusterIndex := range epoch1Clustering {
		tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex))
	}

	// wait for epoch 1 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// submit transactions targeting epoch 2 clusters
	for clusterIndex := range epoch2Clustering {
		tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex))
	}

	waitForGuarantees.Add(expectedGuaranteesPerEpoch)

	// build enough blocks to terminate the epoch 1 cluster consensus
	// NOTE: this is here solely to improve test reliability, as it means that
	// while we are waiting for a guarantee there is only one cluster consensus
	// instance running (per node) rather than two.
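	// having already built DefaultTransactionExpiry/2 blocks above, adding
	// DefaultTransactionExpiry/2+1 more pushes the chain past the expiry-based
	// grace period that keeps the epoch 1 cluster consensus alive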
	tc.builder.AddBlocksWithSeals(flow.DefaultTransactionExpiry/2+1, 1)

	// wait for epoch 2 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// check epoch 1 cluster states
	for _, clusterInfo := range epoch1Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}

	// check epoch 2 cluster states
	for _, clusterInfo := range epoch2Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}
}
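// TestClusterSwitchover_LargerClusters is an illustrative sketch, not part of
// the original suite: it shows how the same harness could exercise a denser
// configuration. The 2-cluster/4-collector parameters are assumptions (two
// collectors per cluster) and have not been tuned for CI runtime.
func TestClusterSwitchover_LargerClusters(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   2,
		collectors: 4,
	}))
}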