github.com/koko1123/flow-go-1@v0.29.6/engine/collection/test/cluster_switchover_test.go (about) 1 package test 2 3 import ( 4 "context" 5 "sync" 6 "testing" 7 "time" 8 9 "github.com/stretchr/testify/mock" 10 "github.com/stretchr/testify/require" 11 12 "github.com/koko1123/flow-go-1/cmd/bootstrap/run" 13 "github.com/koko1123/flow-go-1/engine/testutil" 14 testmock "github.com/koko1123/flow-go-1/engine/testutil/mock" 15 model "github.com/koko1123/flow-go-1/model/bootstrap" 16 "github.com/koko1123/flow-go-1/model/flow" 17 "github.com/koko1123/flow-go-1/model/flow/factory" 18 "github.com/koko1123/flow-go-1/model/flow/filter" 19 "github.com/koko1123/flow-go-1/module" 20 "github.com/koko1123/flow-go-1/module/irrecoverable" 21 "github.com/koko1123/flow-go-1/module/util" 22 "github.com/koko1123/flow-go-1/network/channels" 23 "github.com/koko1123/flow-go-1/network/mocknetwork" 24 "github.com/koko1123/flow-go-1/network/stub" 25 "github.com/koko1123/flow-go-1/state/cluster" 26 bcluster "github.com/koko1123/flow-go-1/state/cluster/badger" 27 "github.com/koko1123/flow-go-1/state/protocol" 28 "github.com/koko1123/flow-go-1/state/protocol/inmem" 29 "github.com/koko1123/flow-go-1/utils/unittest" 30 ) 31 32 // ClusterSwitchoverTestCase comprises one test case of the cluster switchover. 33 // Collection nodes are assigned to one cluster each epoch. On epoch 34 // boundaries they must gracefully terminate cluster consensus for the ending 35 // epoch and begin cluster consensus the beginning epoch. These two consensus 36 // committees co-exist for a short period at the beginning of each epoch. 37 type ClusterSwitchoverTestCase struct { 38 t *testing.T 39 conf ClusterSwitchoverTestConf 40 41 identities flow.IdentityList // identity table 42 hub *stub.Hub // mock network hub 43 root protocol.Snapshot // shared root snapshot 44 nodes []testmock.CollectionNode // collection nodes 45 sn *mocknetwork.Engine // fake consensus node engine for receiving guarantees 46 builder *unittest.EpochBuilder // utility for building epochs 47 48 // epoch counter -> cluster index -> transaction IDs 49 sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions 50 } 51 52 // NewClusterSwitchoverTestCase constructs a new cluster switchover test case 53 // given the configuration, creating all dependencies and mock nodes. 54 func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase { 55 56 tc := &ClusterSwitchoverTestCase{ 57 t: t, 58 conf: conf, 59 } 60 61 nodeInfos := unittest.PrivateNodeInfosFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection)) 62 collectors := model.ToIdentityList(nodeInfos) 63 tc.identities = unittest.CompleteIdentitySet(collectors...) 64 assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors) 65 clusters, err := factory.NewClusterList(assignment, collectors) 66 require.NoError(t, err) 67 rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters) 68 rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks)) 69 for i, cluster := range clusters { 70 signers := make([]model.NodeInfo, 0) 71 signerIDs := make([]flow.Identifier, 0) 72 for _, identity := range nodeInfos { 73 if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster { 74 signers = append(signers, identity) 75 signerIDs = append(signerIDs, identity.NodeID) 76 } 77 } 78 qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlocks[i]) 79 require.NoError(t, err) 80 rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{ 81 View: qc.View, 82 BlockID: qc.BlockID, 83 SignerIDs: signerIDs, 84 SigData: qc.SigData, 85 }) 86 } 87 88 tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList) 89 tc.hub = stub.NewNetworkHub() 90 91 // create a root snapshot with the given number of initial clusters 92 root, result, seal := unittest.BootstrapFixture(tc.identities) 93 qc := unittest.QuorumCertificateFixture(unittest.QCWithBlockID(root.ID())) 94 setup := result.ServiceEvents[0].Event.(*flow.EpochSetup) 95 commit := result.ServiceEvents[1].Event.(*flow.EpochCommit) 96 97 setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, tc.identities) 98 commit.ClusterQCs = rootClusterQCs 99 100 seal.ResultID = result.ID() 101 tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc) 102 require.NoError(t, err) 103 104 cancelCtx, cancel := context.WithCancel(context.Background()) 105 defer cancel() 106 ctx := irrecoverable.NewMockSignalerContext(t, cancelCtx) 107 defer cancel() 108 109 // create a mock node for each collector identity 110 for _, collector := range nodeInfos { 111 node := testutil.CollectionNode(tc.T(), ctx, tc.hub, collector, tc.root) 112 tc.nodes = append(tc.nodes, node) 113 } 114 115 // create a mock consensus node to receive collection guarantees 116 consensus := testutil.GenericNode( 117 tc.T(), 118 tc.hub, 119 tc.identities.Filter(filter.HasRole(flow.RoleConsensus))[0], 120 tc.root, 121 ) 122 tc.sn = new(mocknetwork.Engine) 123 _, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn) 124 require.NoError(tc.T(), err) 125 126 // create an epoch builder hooked to each collector's protocol state 127 states := make([]protocol.MutableState, 0, len(collectors)) 128 for _, node := range tc.nodes { 129 states = append(states, node.State) 130 } 131 // when building new epoch we would like to replace fixture cluster QCs with real ones, for that we need 132 // to generate them using node infos 133 tc.builder = unittest.NewEpochBuilder(tc.T(), states...).UsingCommitOpts(func(commit *flow.EpochCommit) { 134 // build a lookup table for node infos 135 nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo) 136 for _, nodeInfo := range nodeInfos { 137 nodeInfoLookup[nodeInfo.NodeID] = nodeInfo 138 } 139 140 // replace cluster QCs, with real data 141 for i, clusterQC := range commit.ClusterQCs { 142 clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup() 143 signers := make([]model.NodeInfo, 0, len(clusterParticipants)) 144 for _, signerID := range clusterQC.VoterIDs { 145 signer := nodeInfoLookup[signerID] 146 signers = append(signers, signer) 147 } 148 149 // generate root cluster block 150 rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers)) 151 // generate cluster root qc 152 qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlock) 153 require.NoError(t, err) 154 signerIDs := toSignerIDs(signers) 155 qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{ 156 View: qc.View, 157 BlockID: qc.BlockID, 158 SignerIDs: signerIDs, 159 SigData: qc.SigData, 160 } 161 commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs) 162 } 163 }) 164 165 return tc 166 } 167 168 func toSignerIDs(signers []model.NodeInfo) []flow.Identifier { 169 signerIDs := make([]flow.Identifier, 0, len(signers)) 170 for _, signer := range signers { 171 signerIDs = append(signerIDs, signer.NodeID) 172 } 173 return signerIDs 174 } 175 176 // TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster. 177 func TestClusterSwitchover_Simple(t *testing.T) { 178 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 179 clusters: 1, 180 collectors: 1, 181 })) 182 } 183 184 // TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster 185 // containing more than one collector. 186 func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) { 187 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 188 clusters: 1, 189 collectors: 2, 190 })) 191 } 192 193 // TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters. 194 func TestClusterSwitchover_MultiCluster(t *testing.T) { 195 RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{ 196 clusters: 2, 197 collectors: 2, 198 })) 199 } 200 201 // ClusterSwitchoverTestConf configures a test case. 202 type ClusterSwitchoverTestConf struct { 203 clusters uint // # of clusters each epoch 204 collectors uint // # of collectors each epoch 205 } 206 207 func (tc *ClusterSwitchoverTestCase) T() *testing.T { 208 return tc.t 209 } 210 211 // StartNodes starts all collection nodes in the suite and turns on continuous 212 // delivery in the stub network. 213 func (tc *ClusterSwitchoverTestCase) StartNodes() { 214 215 // start all node components 216 nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes)) 217 for _, node := range tc.nodes { 218 nodes = append(nodes, node) 219 } 220 221 unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), time.Second, "could not start nodes") 222 223 // start continuous delivery for all nodes 224 for _, node := range tc.nodes { 225 node.Net.StartConDev(10*time.Millisecond, false) 226 } 227 } 228 229 func (tc *ClusterSwitchoverTestCase) StopNodes() { 230 nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes)) 231 for _, node := range tc.nodes { 232 nodes = append(nodes, node) 233 } 234 unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes") 235 } 236 237 func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header { 238 head, err := tc.root.Head() 239 require.NoError(tc.T(), err) 240 return head 241 } 242 243 func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address { 244 return tc.RootBlock().ChainID.Chain().ServiceAddress() 245 } 246 247 // Transaction returns a transaction which is valid for ingestion by a 248 // collection node in this test suite. 249 func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody { 250 tx := flow.NewTransactionBody(). 251 AddAuthorizer(tc.ServiceAddress()). 252 SetPayer(tc.ServiceAddress()). 253 SetScript(unittest.NoopTxScript()). 254 SetReferenceBlockID(tc.RootBlock().ID()) 255 256 for _, apply := range opts { 257 apply(tx) 258 } 259 260 return tx 261 } 262 263 // ExpectTransaction asserts that the test case expects the given transaction 264 // to be included in the given cluster state for the given epoch. 265 func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) { 266 if _, ok := tc.sentTransactions[epochCounter]; !ok { 267 tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList) 268 } 269 tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex) 270 expected := tc.sentTransactions[epochCounter][clusterIndex] 271 expected = append(expected, txID) 272 tc.sentTransactions[epochCounter][clusterIndex] = expected 273 } 274 275 // ClusterState opens and returns a read-only cluster state for the given node and cluster ID. 276 func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID) cluster.State { 277 state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID) 278 require.NoError(tc.T(), err) 279 return state 280 } 281 282 // State returns the protocol state. 283 func (tc *ClusterSwitchoverTestCase) State() protocol.State { 284 return tc.nodes[0].State 285 } 286 287 // Collector returns the mock node for the collector with the given ID. 288 func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode { 289 for _, node := range tc.nodes { 290 if node.Me.NodeID() == id { 291 return node 292 } 293 } 294 tc.T().FailNow() 295 return testmock.CollectionNode{} 296 } 297 298 // Clusters returns the clusters for the current epoch. 299 func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster { 300 clustering, err := epoch.Clustering() 301 require.NoError(tc.T(), err) 302 303 clusters := make([]protocol.Cluster, 0, len(clustering)) 304 for i := uint(0); i < uint(len(clustering)); i++ { 305 cluster, err := epoch.Cluster(i) 306 require.NoError(tc.T(), err) 307 clusters = append(clusters, cluster) 308 } 309 310 return clusters 311 } 312 313 // BlockInEpoch returns the highest block that exists within the bounds of the 314 // epoch with the given epoch counter. 315 func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header { 316 root := tc.RootBlock() 317 318 for height := root.Height; ; height++ { 319 curr := tc.State().AtHeight(height) 320 next := tc.State().AtHeight(height + 1) 321 curCounter, err := curr.Epochs().Current().Counter() 322 require.NoError(tc.T(), err) 323 nextCounter, err := next.Epochs().Current().Counter() 324 // if we reach a point where the next block doesn't exist, but the 325 // current block has the correct counter, return the current block 326 if err != nil && curCounter == epochCounter { 327 head, err := curr.Head() 328 require.NoError(tc.T(), err) 329 return head 330 } 331 332 // otherwise, wait until we reach the block where the next block is in 333 // the next epoch - this is the highest block in the requested epoch 334 if curCounter == epochCounter && nextCounter == epochCounter+1 { 335 head, err := curr.Head() 336 require.NoError(tc.T(), err) 337 return head 338 } 339 } 340 } 341 342 // SubmitTransactionToCluster submits a transaction to the given cluster in 343 // the given epoch and marks the transaction as expected for inclusion in 344 // the corresponding cluster state. 345 func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster( 346 epochCounter uint64, // the epoch we are submitting the transacting w.r.t. 347 clustering flow.ClusterList, // the clustering for the epoch 348 clusterIndex uint, // the index of the cluster we are targetting 349 ) { 350 351 clusterMembers := clustering[int(clusterIndex)] 352 // get any block within the target epoch as the transaction's reference block 353 refBlock := tc.BlockInEpoch(epochCounter) 354 tx := tc.Transaction(func(tx *flow.TransactionBody) { 355 tx.SetReferenceBlockID(refBlock.ID()) 356 }) 357 clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil) 358 tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID()) 359 360 // submit the transaction to any collector in this cluster 361 err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx) 362 require.NoError(tc.T(), err) 363 } 364 365 // CheckClusterState checks the cluster state of the given node (within the given 366 // cluster) and asserts that only transaction specified by ExpectTransaction are 367 // included. 368 func (tc *ClusterSwitchoverTestCase) CheckClusterState( 369 identity *flow.Identity, 370 clusterInfo protocol.Cluster, 371 ) { 372 node := tc.Collector(identity.NodeID) 373 state := tc.ClusterState(node, clusterInfo.ChainID()) 374 expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()] 375 unittest.NewClusterStateChecker(state). 376 ExpectTxCount(len(expected)). 377 ExpectContainsTx(expected...). 378 Assert(tc.T()) 379 } 380 381 // Timeout returns the timeout for async tasks for this test case. 382 func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration { 383 // 60s + 10s for each collector 384 // locally the whole suite takes 385 // * ~8s when run alone 386 // * ~15-20s when run in parallel with other packages (default) 387 return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors) 388 } 389 390 // RunTestCase comprises the core test logic for cluster switchover. We build 391 // an epoch, which triggers the beginning of the epoch 2 cluster consensus, then 392 // send transactions targeting clusters from both epochs while both are running. 393 func RunTestCase(tc *ClusterSwitchoverTestCase) { 394 395 tc.StartNodes() 396 defer tc.StopNodes() 397 398 // keep track of guarantees received at the mock consensus node 399 // when a guarantee is received, it indicates that the sender has finalized 400 // the corresponding cluster block 401 expectedGuaranteesPerEpoch := int(tc.conf.collectors) 402 waitForGuarantees := new(sync.WaitGroup) 403 waitForGuarantees.Add(expectedGuaranteesPerEpoch) 404 tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything). 405 Return(nil). 406 Run(func(args mock.Arguments) { 407 id, ok := args[1].(flow.Identifier) 408 require.True(tc.T(), ok) 409 _, ok = args[2].(*flow.CollectionGuarantee) 410 tc.T().Log("got guarantee from", id.String()) 411 require.True(tc.T(), ok) 412 waitForGuarantees.Done() 413 }). 414 Times(expectedGuaranteesPerEpoch * 2) 415 416 // build the epoch, ending on the first block on the next epoch 417 tc.builder.BuildEpoch().CompleteEpoch() 418 // build halfway through the grace period for the epoch 1 cluster 419 tc.builder.BuildBlocks(flow.DefaultTransactionExpiry / 2) 420 421 epoch1 := tc.State().Final().Epochs().Previous() 422 epoch2 := tc.State().Final().Epochs().Current() 423 424 epoch1Clusters := tc.Clusters(epoch1) 425 epoch2Clusters := tc.Clusters(epoch2) 426 epoch1Clustering, err := epoch1.Clustering() 427 require.NoError(tc.T(), err) 428 epoch2Clustering, err := epoch2.Clustering() 429 require.NoError(tc.T(), err) 430 431 // submit transactions targeting epoch 1 clusters 432 for clusterIndex := range epoch1Clustering { 433 tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex)) 434 } 435 436 // wait for epoch 1 transactions to be guaranteed 437 unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node") 438 439 // submit transactions targeting epoch 2 clusters 440 for clusterIndex := range epoch2Clustering { 441 tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex)) 442 } 443 444 waitForGuarantees.Add(expectedGuaranteesPerEpoch) 445 446 // build enough blocks to terminate the epoch 1 cluster consensus 447 // NOTE: this is here solely to improve test reliability, as it means that 448 // while we are waiting for a guarantee there is only one cluster consensus 449 // instance running (per node) rather than two. 450 tc.builder.BuildBlocks(flow.DefaultTransactionExpiry/2 + 1) 451 452 // wait for epoch 2 transactions to be guaranteed 453 unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node") 454 455 // check epoch 1 cluster states 456 for _, clusterInfo := range epoch1Clusters { 457 for _, member := range clusterInfo.Members() { 458 tc.CheckClusterState(member, clusterInfo) 459 } 460 } 461 462 // check epoch 2 cluster states 463 for _, clusterInfo := range epoch2Clusters { 464 for _, member := range clusterInfo.Members() { 465 tc.CheckClusterState(member, clusterInfo) 466 } 467 } 468 }