github.com/onflow/flow-go@v0.33.17/engine/collection/test/cluster_switchover_test.go

     1  package test
     2  
     3  import (
     4  	"sync"
     5  	"testing"
     6  	"time"
     7  
     8  	"github.com/stretchr/testify/mock"
     9  	"github.com/stretchr/testify/require"
    10  
    11  	"github.com/onflow/flow-go/cmd/bootstrap/run"
    12  	"github.com/onflow/flow-go/engine/testutil"
    13  	testmock "github.com/onflow/flow-go/engine/testutil/mock"
    14  	model "github.com/onflow/flow-go/model/bootstrap"
    15  	"github.com/onflow/flow-go/model/flow"
    16  	"github.com/onflow/flow-go/model/flow/factory"
    17  	"github.com/onflow/flow-go/model/flow/filter"
    18  	"github.com/onflow/flow-go/module"
    19  	"github.com/onflow/flow-go/module/util"
    20  	"github.com/onflow/flow-go/network/channels"
    21  	"github.com/onflow/flow-go/network/mocknetwork"
    22  	"github.com/onflow/flow-go/network/stub"
    23  	"github.com/onflow/flow-go/state/cluster"
    24  	bcluster "github.com/onflow/flow-go/state/cluster/badger"
    25  	"github.com/onflow/flow-go/state/protocol"
    26  	"github.com/onflow/flow-go/state/protocol/inmem"
    27  	"github.com/onflow/flow-go/utils/unittest"
    28  )
    29  
    30  // ClusterSwitchoverTestCase comprises one test case of the cluster switchover.
    31  // Collection nodes are assigned to one cluster each epoch. On epoch
    32  // boundaries they must gracefully terminate cluster consensus for the ending
    33  // epoch and begin cluster consensus for the new epoch. These two consensus
    34  // committees co-exist for a short period at the beginning of each epoch.
    35  type ClusterSwitchoverTestCase struct {
    36  	t    *testing.T
    37  	conf ClusterSwitchoverTestConf
    38  
    39  	identities flow.IdentityList         // identity table
    40  	hub        *stub.Hub                 // mock network hub
    41  	root       protocol.Snapshot         // shared root snapshot
    42  	nodes      []testmock.CollectionNode // collection nodes
    43  	sn         *mocknetwork.Engine       // fake consensus node engine for receiving guarantees
    44  	builder    *unittest.EpochBuilder    // utility for building epochs
    45  
    46  	// epoch counter -> cluster index -> transaction IDs
    47  	sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions
    48  }
    49  
    50  // NewClusterSwitchoverTestCase constructs a new cluster switchover test case
    51  // given the configuration, creating all dependencies and mock nodes.
    52  func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase {
    53  
    54  	tc := &ClusterSwitchoverTestCase{
    55  		t:    t,
    56  		conf: conf,
    57  	}
    58  
    59  	nodeInfos := unittest.PrivateNodeInfosFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection))
    60  	collectors := model.ToIdentityList(nodeInfos)
    61  	tc.identities = unittest.CompleteIdentitySet(collectors...)
    62  	assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors)
    63  	clusters, err := factory.NewClusterList(assignment, collectors)
    64  	require.NoError(t, err)
    65  	rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters)
    66  	rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks))
    67  	for i, cluster := range clusters {
    68  		signers := make([]model.NodeInfo, 0)
    69  		signerIDs := make([]flow.Identifier, 0)
    70  		for _, identity := range nodeInfos {
    71  			if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster {
    72  				signers = append(signers, identity)
    73  				signerIDs = append(signerIDs, identity.NodeID)
    74  			}
    75  		}
    76  		qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlocks[i])
    77  		require.NoError(t, err)
    78  		rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{
    79  			View:      qc.View,
    80  			BlockID:   qc.BlockID,
    81  			SignerIDs: signerIDs,
    82  			SigData:   qc.SigData,
    83  		})
    84  	}
    85  
    86  	tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList)
    87  	tc.hub = stub.NewNetworkHub()
    88  
    89  	// create a root snapshot with the given number of initial clusters
    90  	root, result, seal := unittest.BootstrapFixture(tc.identities)
    91  	qc := unittest.QuorumCertificateFixture(unittest.QCWithRootBlockID(root.ID()))
    92  	setup := result.ServiceEvents[0].Event.(*flow.EpochSetup)
    93  	commit := result.ServiceEvents[1].Event.(*flow.EpochCommit)
    94  
    95  	setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, tc.identities)
    96  	commit.ClusterQCs = rootClusterQCs
    97  
    98  	seal.ResultID = result.ID()
    99  	tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc)
   100  	require.NoError(t, err)
   101  
   102  	// create a mock node for each collector identity
   103  	for _, collector := range nodeInfos {
   104  		node := testutil.CollectionNode(tc.T(), tc.hub, collector, tc.root)
   105  		tc.nodes = append(tc.nodes, node)
   106  	}
   107  
   108  	// create a mock consensus node to receive collection guarantees
   109  	consensus := testutil.GenericNode(
   110  		tc.T(),
   111  		tc.hub,
   112  		tc.identities.Filter(filter.HasRole(flow.RoleConsensus))[0],
   113  		tc.root,
   114  	)
   115  	tc.sn = new(mocknetwork.Engine)
   116  	_, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn)
   117  	require.NoError(tc.T(), err)
   118  
   119  	// create an epoch builder hooked to each collector's protocol state
   120  	states := make([]protocol.FollowerState, 0, len(collectors))
   121  	for _, node := range tc.nodes {
   122  		states = append(states, node.State)
   123  	}
   124  	// when building a new epoch we want to replace the fixture cluster QCs with real ones;
   125  	// for that we need to generate them using the node infos
   126  	tc.builder = unittest.NewEpochBuilder(tc.T(), states...).UsingCommitOpts(func(commit *flow.EpochCommit) {
   127  		// build a lookup table for node infos
   128  		nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
   129  		for _, nodeInfo := range nodeInfos {
   130  			nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
   131  		}
   132  
   133  		// replace cluster QCs with real data
   134  		for i, clusterQC := range commit.ClusterQCs {
   135  			clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup()
   136  			signers := make([]model.NodeInfo, 0, len(clusterParticipants))
   137  			for _, signerID := range clusterQC.VoterIDs {
   138  				signer := nodeInfoLookup[signerID]
   139  				signers = append(signers, signer)
   140  			}
   141  
   142  			// generate root cluster block
   143  			rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers))
   144  			// generate cluster root qc
   145  			qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlock)
   146  			require.NoError(t, err)
   147  			signerIDs := toSignerIDs(signers)
   148  			qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{
   149  				View:      qc.View,
   150  				BlockID:   qc.BlockID,
   151  				SignerIDs: signerIDs,
   152  				SigData:   qc.SigData,
   153  			}
   154  			commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs)
   155  		}
   156  	})
   157  
   158  	return tc
   159  }
   160  
   161  func toSignerIDs(signers []model.NodeInfo) []flow.Identifier {
   162  	signerIDs := make([]flow.Identifier, 0, len(signers))
   163  	for _, signer := range signers {
   164  		signerIDs = append(signerIDs, signer.NodeID)
   165  	}
   166  	return signerIDs
   167  }
   168  
   169  // TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster.
   170  func TestClusterSwitchover_Simple(t *testing.T) {
   171  	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
   172  		clusters:   1,
   173  		collectors: 1,
   174  	}))
   175  }
   176  
   177  // TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster
   178  // containing more than one collector.
   179  func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) {
   180  	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
   181  		clusters:   1,
   182  		collectors: 2,
   183  	}))
   184  }
   185  
   186  // TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters.
   187  func TestClusterSwitchover_MultiCluster(t *testing.T) {
   188  	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
   189  		clusters:   2,
   190  		collectors: 2,
   191  	}))
   192  }
   193  
   194  // ClusterSwitchoverTestConf configures a test case.
   195  type ClusterSwitchoverTestConf struct {
   196  	clusters   uint // # of clusters each epoch
   197  	collectors uint // # of collectors each epoch
   198  }
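
// As an illustrative sketch only (the counts below are assumptions, not an
// existing test in this suite), a larger configuration could be exercised in
// the same way as the tests above:
//
//	func TestClusterSwitchover_ManyClusters(t *testing.T) {
//		RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
//			clusters:   3,
//			collectors: 6,
//		}))
//	}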
   199  
   200  func (tc *ClusterSwitchoverTestCase) T() *testing.T {
   201  	return tc.t
   202  }
   203  
   204  // StartNodes starts all collection nodes in the suite and turns on continuous
   205  // delivery in the stub network.
   206  func (tc *ClusterSwitchoverTestCase) StartNodes() {
   207  
   208  	// start all node components
   209  	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
   210  	for _, node := range tc.nodes {
   211  		node.Start(tc.T())
   212  		nodes = append(nodes, node)
   213  	}
   214  
   215  	unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes")
   216  
   217  	// start continuous delivery for all nodes
   218  	for _, node := range tc.nodes {
   219  		node.Net.StartConDev(10*time.Millisecond, false)
   220  	}
   221  }
   222  
   223  func (tc *ClusterSwitchoverTestCase) StopNodes() {
   224  	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
   225  	for _, node := range tc.nodes {
   226  		nodes = append(nodes, node)
   227  	}
   228  	unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes")
   229  }
   230  
   231  func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header {
   232  	head, err := tc.root.Head()
   233  	require.NoError(tc.T(), err)
   234  	return head
   235  }
   236  
   237  func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address {
   238  	return tc.RootBlock().ChainID.Chain().ServiceAddress()
   239  }
   240  
   241  // Transaction returns a transaction which is valid for ingestion by a
   242  // collection node in this test suite.
   243  func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody {
   244  	tx := flow.NewTransactionBody().
   245  		AddAuthorizer(tc.ServiceAddress()).
   246  		SetPayer(tc.ServiceAddress()).
   247  		SetScript(unittest.NoopTxScript()).
   248  		SetReferenceBlockID(tc.RootBlock().ID())
   249  
   250  	for _, apply := range opts {
   251  		apply(tx)
   252  	}
   253  
   254  	return tx
   255  }
   256  
   257  // ExpectTransaction records the expectation that the given transaction
   258  // will be included in the given cluster state for the given epoch.
   259  func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) {
   260  	if _, ok := tc.sentTransactions[epochCounter]; !ok {
   261  		tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList)
   262  	}
   263  	tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex)
   264  	expected := tc.sentTransactions[epochCounter][clusterIndex]
   265  	expected = append(expected, txID)
   266  	tc.sentTransactions[epochCounter][clusterIndex] = expected
   267  }
   268  
   269  // ClusterState opens and returns a read-only cluster state for the given node and cluster ID.
   270  func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID, epoch uint64) cluster.State {
   271  	state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID, epoch)
   272  	require.NoError(tc.T(), err)
   273  	return state
   274  }
   275  
   276  // State returns the protocol state.
   277  func (tc *ClusterSwitchoverTestCase) State() protocol.State {
   278  	return tc.nodes[0].State
   279  }
   280  
   281  // Collector returns the mock node for the collector with the given ID.
   282  func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode {
   283  	for _, node := range tc.nodes {
   284  		if node.Me.NodeID() == id {
   285  			return node
   286  		}
   287  	}
   288  	tc.T().FailNow()
   289  	return testmock.CollectionNode{}
   290  }
   291  
   292  // Clusters returns the clusters for the given epoch.
   293  func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster {
   294  	clustering, err := epoch.Clustering()
   295  	require.NoError(tc.T(), err)
   296  
   297  	clusters := make([]protocol.Cluster, 0, len(clustering))
   298  	for i := uint(0); i < uint(len(clustering)); i++ {
   299  		cluster, err := epoch.Cluster(i)
   300  		require.NoError(tc.T(), err)
   301  		clusters = append(clusters, cluster)
   302  	}
   303  
   304  	return clusters
   305  }
   306  
   307  // BlockInEpoch returns the highest block that exists within the bounds of the
   308  // epoch with the given epoch counter.
   309  func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header {
   310  	root := tc.RootBlock()
   311  
   312  	for height := root.Height; ; height++ {
   313  		curr := tc.State().AtHeight(height)
   314  		next := tc.State().AtHeight(height + 1)
   315  		curCounter, err := curr.Epochs().Current().Counter()
   316  		require.NoError(tc.T(), err)
   317  		nextCounter, err := next.Epochs().Current().Counter()
   318  		// if we reach a point where the next block doesn't exist, but the
   319  		// current block has the correct counter, return the current block
   320  		if err != nil && curCounter == epochCounter {
   321  			head, err := curr.Head()
   322  			require.NoError(tc.T(), err)
   323  			return head
   324  		}
   325  
   326  		// otherwise, continue until we reach the block whose child is in
   327  		// the next epoch - this is the highest block in the requested epoch
   328  		if curCounter == epochCounter && nextCounter == epochCounter+1 {
   329  			head, err := curr.Head()
   330  			require.NoError(tc.T(), err)
   331  			return head
   332  		}
   333  	}
   334  }
   335  
   336  // SubmitTransactionToCluster submits a transaction to the given cluster in
   337  // the given epoch and marks the transaction as expected for inclusion in
   338  // the corresponding cluster state.
   339  func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster(
   340  	epochCounter uint64, // the epoch we are submitting the transaction w.r.t.
   341  	clustering flow.ClusterList, // the clustering for the epoch
   342  	clusterIndex uint, // the index of the cluster we are targeting
   343  ) {
   344  
   345  	clusterMembers := clustering[int(clusterIndex)]
   346  	// get any block within the target epoch as the transaction's reference block
   347  	refBlock := tc.BlockInEpoch(epochCounter)
   348  	tx := tc.Transaction(func(tx *flow.TransactionBody) {
   349  		tx.SetReferenceBlockID(refBlock.ID())
   350  	})
   351  	clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil)
   352  	tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID())
   353  
   354  	// submit the transaction to any collector in this cluster
   355  	err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx)
   356  	require.NoError(tc.T(), err)
   357  }
   358  
   359  // CheckClusterState checks the cluster state of the given node (within the given
   360  // cluster) and asserts that only transactions specified by ExpectTransaction are
   361  // included.
   362  func (tc *ClusterSwitchoverTestCase) CheckClusterState(
   363  	identity *flow.Identity,
   364  	clusterInfo protocol.Cluster,
   365  ) {
   366  	node := tc.Collector(identity.NodeID)
   367  	state := tc.ClusterState(node, clusterInfo.ChainID(), clusterInfo.EpochCounter())
   368  	expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()]
   369  	unittest.NewClusterStateChecker(state).
   370  		ExpectTxCount(len(expected)).
   371  		ExpectContainsTx(expected...).
   372  		Assert(tc.T())
   373  }
   374  
   375  // Timeout returns the timeout for async tasks for this test case.
   376  func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration {
   377  	// 60s + 10s for each collector
   378  	// locally the whole suite takes
   379  	// * ~8s when run alone
   380  	// * ~15-20s when run in parallel with other packages (default)
   381  	return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors)
   382  }
   383  
   384  // RunTestCase comprises the core test logic for cluster switchover. We build
   385  // an epoch, which triggers the beginning of the epoch 2 cluster consensus, then
   386  // send transactions targeting clusters from both epochs while both are running.
   387  func RunTestCase(tc *ClusterSwitchoverTestCase) {
   388  
   389  	tc.StartNodes()
   390  	defer tc.StopNodes()
   391  
   392  	// keep track of guarantees received at the mock consensus node; when a
   393  	// guarantee is received, it indicates that the sender has finalized
   394  	// the corresponding cluster block
   395  	expectedGuaranteesPerEpoch := int(tc.conf.collectors)
   396  	waitForGuarantees := new(sync.WaitGroup)
   397  	waitForGuarantees.Add(expectedGuaranteesPerEpoch)
   398  	tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything).
   399  		Return(nil).
   400  		Run(func(args mock.Arguments) {
   401  			id, ok := args[1].(flow.Identifier)
   402  			require.True(tc.T(), ok)
   403  			_, ok = args[2].(*flow.CollectionGuarantee)
   404  			tc.T().Log("got guarantee from", id.String())
   405  			require.True(tc.T(), ok)
   406  			waitForGuarantees.Done()
   407  		}).
   408  		Times(expectedGuaranteesPerEpoch * 2)
   409  
   410  	// build the epoch, ending on the first block of the next epoch
   411  	tc.builder.BuildEpoch().CompleteEpoch()
   412  	// build halfway through the grace period for the epoch 1 cluster
   413  	tc.builder.BuildBlocks(flow.DefaultTransactionExpiry / 2)
   414  
   415  	epoch1 := tc.State().Final().Epochs().Previous()
   416  	epoch2 := tc.State().Final().Epochs().Current()
   417  
   418  	epoch1Clusters := tc.Clusters(epoch1)
   419  	epoch2Clusters := tc.Clusters(epoch2)
   420  	epoch1Clustering, err := epoch1.Clustering()
   421  	require.NoError(tc.T(), err)
   422  	epoch2Clustering, err := epoch2.Clustering()
   423  	require.NoError(tc.T(), err)
   424  
   425  	// submit transactions targeting epoch 1 clusters
   426  	for clusterIndex := range epoch1Clustering {
   427  		tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex))
   428  	}
   429  
   430  	// wait for epoch 1 transactions to be guaranteed
   431  	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")
   432  
   433  	// submit transactions targeting epoch 2 clusters
   434  	for clusterIndex := range epoch2Clustering {
   435  		tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex))
   436  	}
   437  
   438  	waitForGuarantees.Add(expectedGuaranteesPerEpoch)
   439  
   440  	// build enough blocks to terminate the epoch 1 cluster consensus
   441  	// NOTE: this is here solely to improve test reliability, as it means that
   442  	// while we are waiting for a guarantee there is only one cluster consensus
   443  	// instance running (per node) rather than two.
   444  	tc.builder.BuildBlocks(flow.DefaultTransactionExpiry/2 + 1)
   445  
   446  	// wait for epoch 2 transactions to be guaranteed
   447  	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")
   448  
   449  	// check epoch 1 cluster states
   450  	for _, clusterInfo := range epoch1Clusters {
   451  		for _, member := range clusterInfo.Members() {
   452  			tc.CheckClusterState(member, clusterInfo)
   453  		}
   454  	}
   455  
   456  	// check epoch 2 cluster states
   457  	for _, clusterInfo := range epoch2Clusters {
   458  		for _, member := range clusterInfo.Members() {
   459  			tc.CheckClusterState(member, clusterInfo)
   460  		}
   461  	}
   462  }