github.com/onflow/flow-go@v0.33.17/engine/collection/test/cluster_switchover_test.go (about) 1 package test 2 3 import ( 4 "sync" 5 "testing" 6 "time" 7 8 "github.com/stretchr/testify/mock" 9 "github.com/stretchr/testify/require" 10 11 "github.com/onflow/flow-go/cmd/bootstrap/run" 12 "github.com/onflow/flow-go/engine/testutil" 13 testmock "github.com/onflow/flow-go/engine/testutil/mock" 14 model "github.com/onflow/flow-go/model/bootstrap" 15 "github.com/onflow/flow-go/model/flow" 16 "github.com/onflow/flow-go/model/flow/factory" 17 "github.com/onflow/flow-go/model/flow/filter" 18 "github.com/onflow/flow-go/module" 19 "github.com/onflow/flow-go/module/util" 20 "github.com/onflow/flow-go/network/channels" 21 "github.com/onflow/flow-go/network/mocknetwork" 22 "github.com/onflow/flow-go/network/stub" 23 "github.com/onflow/flow-go/state/cluster" 24 bcluster "github.com/onflow/flow-go/state/cluster/badger" 25 "github.com/onflow/flow-go/state/protocol" 26 "github.com/onflow/flow-go/state/protocol/inmem" 27 "github.com/onflow/flow-go/utils/unittest" 28 ) 29 30 // ClusterSwitchoverTestCase comprises one test case of the cluster switchover. 31 // Collection nodes are assigned to one cluster each epoch. On epoch 32 // boundaries they must gracefully terminate cluster consensus for the ending 33 // epoch and begin cluster consensus for the beginning epoch. These two consensus 34 // committees co-exist for a short period at the beginning of each epoch. 
35 type ClusterSwitchoverTestCase struct { 36 t *testing.T 37 conf ClusterSwitchoverTestConf 38 39 identities flow.IdentityList // identity table 40 hub *stub.Hub // mock network hub 41 root protocol.Snapshot // shared root snapshot 42 nodes []testmock.CollectionNode // collection nodes 43 sn *mocknetwork.Engine // fake consensus node engine for receiving guarantees 44 builder *unittest.EpochBuilder // utility for building epochs 45 46 // epoch counter -> cluster index -> transaction IDs 47 sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions 48 } 49 50 // NewClusterSwitchoverTestCase constructs a new cluster switchover test case 51 // given the configuration, creating all dependencies and mock nodes. 52 func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase { 53 54 tc := &ClusterSwitchoverTestCase{ 55 t: t, 56 conf: conf, 57 } 58 59 nodeInfos := unittest.PrivateNodeInfosFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection)) 60 collectors := model.ToIdentityList(nodeInfos) 61 tc.identities = unittest.CompleteIdentitySet(collectors...) 
62 assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors) 63 clusters, err := factory.NewClusterList(assignment, collectors) 64 require.NoError(t, err) 65 rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters) 66 rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks)) 67 for i, cluster := range clusters { 68 signers := make([]model.NodeInfo, 0) 69 signerIDs := make([]flow.Identifier, 0) 70 for _, identity := range nodeInfos { 71 if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster { 72 signers = append(signers, identity) 73 signerIDs = append(signerIDs, identity.NodeID) 74 } 75 } 76 qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlocks[i]) 77 require.NoError(t, err) 78 rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{ 79 View: qc.View, 80 BlockID: qc.BlockID, 81 SignerIDs: signerIDs, 82 SigData: qc.SigData, 83 }) 84 } 85 86 tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList) 87 tc.hub = stub.NewNetworkHub() 88 89 // create a root snapshot with the given number of initial clusters 90 root, result, seal := unittest.BootstrapFixture(tc.identities) 91 qc := unittest.QuorumCertificateFixture(unittest.QCWithRootBlockID(root.ID())) 92 setup := result.ServiceEvents[0].Event.(*flow.EpochSetup) 93 commit := result.ServiceEvents[1].Event.(*flow.EpochCommit) 94 95 setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, tc.identities) 96 commit.ClusterQCs = rootClusterQCs 97 98 seal.ResultID = result.ID() 99 tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc) 100 require.NoError(t, err) 101 102 // create a mock node for each collector identity 103 for _, collector := range nodeInfos { 104 node := testutil.CollectionNode(tc.T(), tc.hub, collector, tc.root) 105 tc.nodes = append(tc.nodes, node) 106 } 107 108 // create a mock consensus node to receive collection guarantees 109 consensus := 
testutil.GenericNode( 110 tc.T(), 111 tc.hub, 112 tc.identities.Filter(filter.HasRole(flow.RoleConsensus))[0], 113 tc.root, 114 ) 115 tc.sn = new(mocknetwork.Engine) 116 _, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn) 117 require.NoError(tc.T(), err) 118 119 // create an epoch builder hooked to each collector's protocol state 120 states := make([]protocol.FollowerState, 0, len(collectors)) 121 for _, node := range tc.nodes { 122 states = append(states, node.State) 123 } 124 // when building new epoch we would like to replace fixture cluster QCs with real ones, for that we need 125 // to generate them using node infos 126 tc.builder = unittest.NewEpochBuilder(tc.T(), states...).UsingCommitOpts(func(commit *flow.EpochCommit) { 127 // build a lookup table for node infos 128 nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo) 129 for _, nodeInfo := range nodeInfos { 130 nodeInfoLookup[nodeInfo.NodeID] = nodeInfo 131 } 132 133 // replace cluster QCs, with real data 134 for i, clusterQC := range commit.ClusterQCs { 135 clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup() 136 signers := make([]model.NodeInfo, 0, len(clusterParticipants)) 137 for _, signerID := range clusterQC.VoterIDs { 138 signer := nodeInfoLookup[signerID] 139 signers = append(signers, signer) 140 } 141 142 // generate root cluster block 143 rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers)) 144 // generate cluster root qc 145 qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlock) 146 require.NoError(t, err) 147 signerIDs := toSignerIDs(signers) 148 qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{ 149 View: qc.View, 150 BlockID: qc.BlockID, 151 SignerIDs: signerIDs, 152 SigData: qc.SigData, 153 } 154 commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs) 155 } 156 }) 157 158 return tc 159 } 160 161 func toSignerIDs(signers []model.NodeInfo) 
[]flow.Identifier { 162 signerIDs := make([]flow.Identifier, 0, len(signers)) 163 for _, signer := range signers { 164 signerIDs = append(signerIDs, signer.NodeID) 165 } 166 return signerIDs 167 } 168 169 // TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster. 170 func TestClusterSwitchover_Simple(t *testing.T) { 171 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 172 clusters: 1, 173 collectors: 1, 174 })) 175 } 176 177 // TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster 178 // containing more than one collector. 179 func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) { 180 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 181 clusters: 1, 182 collectors: 2, 183 })) 184 } 185 186 // TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters. 187 func TestClusterSwitchover_MultiCluster(t *testing.T) { 188 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 189 clusters: 2, 190 collectors: 2, 191 })) 192 } 193 194 // ClusterSwitchoverTestConf configures a test case. 195 type ClusterSwitchoverTestConf struct { 196 clusters uint // # of clusters each epoch 197 collectors uint // # of collectors each epoch 198 } 199 200 func (tc *ClusterSwitchoverTestCase) T() *testing.T { 201 return tc.t 202 } 203 204 // StartNodes starts all collection nodes in the suite and turns on continuous 205 // delivery in the stub network. 
206 func (tc *ClusterSwitchoverTestCase) StartNodes() { 207 208 // start all node components 209 nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes)) 210 for _, node := range tc.nodes { 211 node.Start(tc.T()) 212 nodes = append(nodes, node) 213 } 214 215 unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes") 216 217 // start continuous delivery for all nodes 218 for _, node := range tc.nodes { 219 node.Net.StartConDev(10*time.Millisecond, false) 220 } 221 } 222 223 func (tc *ClusterSwitchoverTestCase) StopNodes() { 224 nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes)) 225 for _, node := range tc.nodes { 226 nodes = append(nodes, node) 227 } 228 unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes") 229 } 230 231 func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header { 232 head, err := tc.root.Head() 233 require.NoError(tc.T(), err) 234 return head 235 } 236 237 func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address { 238 return tc.RootBlock().ChainID.Chain().ServiceAddress() 239 } 240 241 // Transaction returns a transaction which is valid for ingestion by a 242 // collection node in this test suite. 243 func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody { 244 tx := flow.NewTransactionBody(). 245 AddAuthorizer(tc.ServiceAddress()). 246 SetPayer(tc.ServiceAddress()). 247 SetScript(unittest.NoopTxScript()). 248 SetReferenceBlockID(tc.RootBlock().ID()) 249 250 for _, apply := range opts { 251 apply(tx) 252 } 253 254 return tx 255 } 256 257 // ExpectTransaction asserts that the test case expects the given transaction 258 // to be included in the given cluster state for the given epoch. 
259 func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) { 260 if _, ok := tc.sentTransactions[epochCounter]; !ok { 261 tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList) 262 } 263 tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex) 264 expected := tc.sentTransactions[epochCounter][clusterIndex] 265 expected = append(expected, txID) 266 tc.sentTransactions[epochCounter][clusterIndex] = expected 267 } 268 269 // ClusterState opens and returns a read-only cluster state for the given node and cluster ID. 270 func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID, epoch uint64) cluster.State { 271 state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID, epoch) 272 require.NoError(tc.T(), err) 273 return state 274 } 275 276 // State returns the protocol state. 277 func (tc *ClusterSwitchoverTestCase) State() protocol.State { 278 return tc.nodes[0].State 279 } 280 281 // Collector returns the mock node for the collector with the given ID. 282 func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode { 283 for _, node := range tc.nodes { 284 if node.Me.NodeID() == id { 285 return node 286 } 287 } 288 tc.T().FailNow() 289 return testmock.CollectionNode{} 290 } 291 292 // Clusters returns the clusters for the current epoch. 
293 func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster { 294 clustering, err := epoch.Clustering() 295 require.NoError(tc.T(), err) 296 297 clusters := make([]protocol.Cluster, 0, len(clustering)) 298 for i := uint(0); i < uint(len(clustering)); i++ { 299 cluster, err := epoch.Cluster(i) 300 require.NoError(tc.T(), err) 301 clusters = append(clusters, cluster) 302 } 303 304 return clusters 305 } 306 307 // BlockInEpoch returns the highest block that exists within the bounds of the 308 // epoch with the given epoch counter. 309 func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header { 310 root := tc.RootBlock() 311 312 for height := root.Height; ; height++ { 313 curr := tc.State().AtHeight(height) 314 next := tc.State().AtHeight(height + 1) 315 curCounter, err := curr.Epochs().Current().Counter() 316 require.NoError(tc.T(), err) 317 nextCounter, err := next.Epochs().Current().Counter() 318 // if we reach a point where the next block doesn't exist, but the 319 // current block has the correct counter, return the current block 320 if err != nil && curCounter == epochCounter { 321 head, err := curr.Head() 322 require.NoError(tc.T(), err) 323 return head 324 } 325 326 // otherwise, wait until we reach the block where the next block is in 327 // the next epoch - this is the highest block in the requested epoch 328 if curCounter == epochCounter && nextCounter == epochCounter+1 { 329 head, err := curr.Head() 330 require.NoError(tc.T(), err) 331 return head 332 } 333 } 334 } 335 336 // SubmitTransactionToCluster submits a transaction to the given cluster in 337 // the given epoch and marks the transaction as expected for inclusion in 338 // the corresponding cluster state. 339 func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster( 340 epochCounter uint64, // the epoch we are submitting the transacting w.r.t. 
341 clustering flow.ClusterList, // the clustering for the epoch 342 clusterIndex uint, // the index of the cluster we are targetting 343 ) { 344 345 clusterMembers := clustering[int(clusterIndex)] 346 // get any block within the target epoch as the transaction's reference block 347 refBlock := tc.BlockInEpoch(epochCounter) 348 tx := tc.Transaction(func(tx *flow.TransactionBody) { 349 tx.SetReferenceBlockID(refBlock.ID()) 350 }) 351 clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil) 352 tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID()) 353 354 // submit the transaction to any collector in this cluster 355 err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx) 356 require.NoError(tc.T(), err) 357 } 358 359 // CheckClusterState checks the cluster state of the given node (within the given 360 // cluster) and asserts that only transaction specified by ExpectTransaction are 361 // included. 362 func (tc *ClusterSwitchoverTestCase) CheckClusterState( 363 identity *flow.Identity, 364 clusterInfo protocol.Cluster, 365 ) { 366 node := tc.Collector(identity.NodeID) 367 state := tc.ClusterState(node, clusterInfo.ChainID(), clusterInfo.EpochCounter()) 368 expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()] 369 unittest.NewClusterStateChecker(state). 370 ExpectTxCount(len(expected)). 371 ExpectContainsTx(expected...). 372 Assert(tc.T()) 373 } 374 375 // Timeout returns the timeout for async tasks for this test case. 376 func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration { 377 // 60s + 10s for each collector 378 // locally the whole suite takes 379 // * ~8s when run alone 380 // * ~15-20s when run in parallel with other packages (default) 381 return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors) 382 } 383 384 // RunTestCase comprises the core test logic for cluster switchover. 
We build 385 // an epoch, which triggers the beginning of the epoch 2 cluster consensus, then 386 // send transactions targeting clusters from both epochs while both are running. 387 func RunTestCase(tc *ClusterSwitchoverTestCase) { 388 389 tc.StartNodes() 390 defer tc.StopNodes() 391 392 // keep track of guarantees received at the mock consensus node 393 // when a guarantee is received, it indicates that the sender has finalized 394 // the corresponding cluster block 395 expectedGuaranteesPerEpoch := int(tc.conf.collectors) 396 waitForGuarantees := new(sync.WaitGroup) 397 waitForGuarantees.Add(expectedGuaranteesPerEpoch) 398 tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything). 399 Return(nil). 400 Run(func(args mock.Arguments) { 401 id, ok := args[1].(flow.Identifier) 402 require.True(tc.T(), ok) 403 _, ok = args[2].(*flow.CollectionGuarantee) 404 tc.T().Log("got guarantee from", id.String()) 405 require.True(tc.T(), ok) 406 waitForGuarantees.Done() 407 }). 408 Times(expectedGuaranteesPerEpoch * 2) 409 410 // build the epoch, ending on the first block on the next epoch 411 tc.builder.BuildEpoch().CompleteEpoch() 412 // build halfway through the grace period for the epoch 1 cluster 413 tc.builder.BuildBlocks(flow.DefaultTransactionExpiry / 2) 414 415 epoch1 := tc.State().Final().Epochs().Previous() 416 epoch2 := tc.State().Final().Epochs().Current() 417 418 epoch1Clusters := tc.Clusters(epoch1) 419 epoch2Clusters := tc.Clusters(epoch2) 420 epoch1Clustering, err := epoch1.Clustering() 421 require.NoError(tc.T(), err) 422 epoch2Clustering, err := epoch2.Clustering() 423 require.NoError(tc.T(), err) 424 425 // submit transactions targeting epoch 1 clusters 426 for clusterIndex := range epoch1Clustering { 427 tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex)) 428 } 429 430 // wait for epoch 1 transactions to be guaranteed 431 unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at 
consensus node") 432 433 // submit transactions targeting epoch 2 clusters 434 for clusterIndex := range epoch2Clustering { 435 tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex)) 436 } 437 438 waitForGuarantees.Add(expectedGuaranteesPerEpoch) 439 440 // build enough blocks to terminate the epoch 1 cluster consensus 441 // NOTE: this is here solely to improve test reliability, as it means that 442 // while we are waiting for a guarantee there is only one cluster consensus 443 // instance running (per node) rather than two. 444 tc.builder.BuildBlocks(flow.DefaultTransactionExpiry/2 + 1) 445 446 // wait for epoch 2 transactions to be guaranteed 447 unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node") 448 449 // check epoch 1 cluster states 450 for _, clusterInfo := range epoch1Clusters { 451 for _, member := range clusterInfo.Members() { 452 tc.CheckClusterState(member, clusterInfo) 453 } 454 } 455 456 // check epoch 2 cluster states 457 for _, clusterInfo := range epoch2Clusters { 458 for _, member := range clusterInfo.Members() { 459 tc.CheckClusterState(member, clusterInfo) 460 } 461 } 462 }