github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/collection/test/cluster_switchover_test.go

package test

import (
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"

	"github.com/onflow/flow-go/cmd/bootstrap/run"
	"github.com/onflow/flow-go/engine/testutil"
	testmock "github.com/onflow/flow-go/engine/testutil/mock"
	model "github.com/onflow/flow-go/model/bootstrap"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/factory"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/util"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/network/mocknetwork"
	"github.com/onflow/flow-go/network/stub"
	"github.com/onflow/flow-go/state/cluster"
	bcluster "github.com/onflow/flow-go/state/cluster/badger"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/state/protocol/inmem"
	"github.com/onflow/flow-go/state/protocol/protocol_state/kvstore"
	protocol_state "github.com/onflow/flow-go/state/protocol/protocol_state/state"
	"github.com/onflow/flow-go/utils/unittest"
)
// ClusterSwitchoverTestCase comprises one test case of the cluster switchover.
// Collection nodes are assigned to one cluster each epoch. On epoch
// boundaries they must gracefully terminate cluster consensus for the ending
// epoch and begin cluster consensus for the beginning epoch. These two
// consensus committees co-exist for a short period at the beginning of each
// epoch.
type ClusterSwitchoverTestCase struct {
	t    *testing.T
	conf ClusterSwitchoverTestConf

	nodeInfos []model.NodeInfo          // identity table
	hub       *stub.Hub                 // mock network hub
	root      protocol.Snapshot         // shared root snapshot
	nodes     []testmock.CollectionNode // collection nodes
	sn        *mocknetwork.Engine       // fake consensus node engine for receiving guarantees
	builder   *unittest.EpochBuilder    // utility for building epochs

	// epoch counter -> cluster index -> transaction IDs
	sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions
}
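
// For illustration, the sentTransactions index is nested by epoch counter,
// then by cluster index: a transaction submitted to cluster 0 in epoch 2
// would be recorded roughly as follows (txID is a hypothetical transaction
// ID):
//
//	tc.sentTransactions[2] = map[uint]flow.IdentifierList{
//		0: {txID},
//	}
//
// CheckClusterState later reads this same index to assert inclusion.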

// NewClusterSwitchoverTestCase constructs a new cluster switchover test case
// given the configuration, creating all dependencies and mock nodes.
func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase {

	tc := &ClusterSwitchoverTestCase{
		t:    t,
		conf: conf,
	}
	tc.nodeInfos = unittest.PrivateNodeInfosFromIdentityList(
		unittest.CompleteIdentitySet(
			unittest.IdentityListFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection))...),
	)
	identities := model.ToIdentityList(tc.nodeInfos)
	collectors := identities.Filter(filter.HasRole[flow.Identity](flow.RoleCollection)).ToSkeleton()
	assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors)
	clusters, err := factory.NewClusterList(assignment, collectors)
	require.NoError(t, err)
	rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters)
	rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks))
	for i, cluster := range clusters {
		signers := make([]model.NodeInfo, 0)
		for _, identity := range tc.nodeInfos {
			if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster {
				signers = append(signers, identity)
			}
		}
		signerIdentities := model.ToIdentityList(signers).Sort(flow.Canonical[flow.Identity]).ToSkeleton()
		qc, err := run.GenerateClusterRootQC(signers, signerIdentities, rootClusterBlocks[i])
		require.NoError(t, err)
		rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{
			View:      qc.View,
			BlockID:   qc.BlockID,
			SignerIDs: signerIdentities.NodeIDs(),
			SigData:   qc.SigData,
		})
	}

	tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList)
	tc.hub = stub.NewNetworkHub()

	// create a root snapshot with the given number of initial clusters
	root, result, seal := unittest.BootstrapFixture(identities)
	qc := unittest.QuorumCertificateFixture(unittest.QCWithRootBlockID(root.ID()))
	setup := result.ServiceEvents[0].Event.(*flow.EpochSetup)
	commit := result.ServiceEvents[1].Event.(*flow.EpochCommit)

	setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, identities.ToSkeleton())
	commit.ClusterQCs = rootClusterQCs

	seal.ResultID = result.ID()
	root.Payload.ProtocolStateID = kvstore.NewDefaultKVStore(
		inmem.ProtocolStateFromEpochServiceEvents(setup, commit).ID()).ID()
	tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc)
	require.NoError(t, err)

	// build a lookup table for node infos
	nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
	for _, nodeInfo := range tc.nodeInfos {
		nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
	}

	// create a mock node for each collector identity
	for _, collector := range collectors {
		nodeInfo := nodeInfoLookup[collector.NodeID]
		node := testutil.CollectionNode(tc.T(), tc.hub, nodeInfo, tc.root)
		tc.nodes = append(tc.nodes, node)
	}

	// create a mock consensus node to receive collection guarantees
	consensus := testutil.GenericNode(
		tc.T(),
		tc.hub,
		nodeInfoLookup[identities.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0].NodeID],
		tc.root,
	)
	tc.sn = new(mocknetwork.Engine)
	_, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn)
	require.NoError(tc.T(), err)

	// create an epoch builder hooked to each collector's protocol state
	states := make([]protocol.FollowerState, 0)
	for _, node := range tc.nodes {
		states = append(states, node.State)
	}

	// take the first collection node and use its storage as the data source for the stateMutator
	refNode := tc.nodes[0]
	stateMutator := protocol_state.NewMutableProtocolState(
		refNode.EpochProtocolState,
		refNode.ProtocolKVStore,
		refNode.State.Params(),
		refNode.Headers,
		refNode.Results,
		refNode.Setups,
		refNode.EpochCommits,
	)

	// when building a new epoch, we would like to replace the fixture cluster QCs
	// with real ones; for that we need to generate them using the node infos
	tc.builder = unittest.NewEpochBuilder(tc.T(), stateMutator, states...).UsingCommitOpts(func(commit *flow.EpochCommit) {
		// build a lookup table for node infos
		nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
		for _, nodeInfo := range tc.nodeInfos {
			nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
		}

		// replace the cluster QCs with real data
		for i, clusterQC := range commit.ClusterQCs {
			clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup()
			signers := make([]model.NodeInfo, 0, len(clusterParticipants))
			for _, signerID := range clusterQC.VoterIDs {
				signer := nodeInfoLookup[signerID]
				signers = append(signers, signer)
			}

			// generate root cluster block
			rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers).ToSkeleton())
			// generate cluster root qc
			qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers).ToSkeleton(), rootClusterBlock)
			require.NoError(t, err)
			signerIDs := toSignerIDs(signers)
			qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{
				View:      qc.View,
				BlockID:   qc.BlockID,
				SignerIDs: signerIDs,
				SigData:   qc.SigData,
			}
			commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs)
		}
	})

	return tc
}

func toSignerIDs(signers []model.NodeInfo) []flow.Identifier {
	signerIDs := make([]flow.Identifier, 0, len(signers))
	for _, signer := range signers {
		signerIDs = append(signerIDs, signer.NodeID)
	}
	return signerIDs
}

// TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster.
func TestClusterSwitchover_Simple(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 1,
	}))
}

// TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster
// containing more than one collector.
func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 2,
	}))
}

// TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters.
func TestClusterSwitchover_MultiCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   2,
		collectors: 2,
	}))
}
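
// As a usage note (assuming a standard Go toolchain), an individual case can
// be run from the repository root with the -run flag, for example:
//
//	go test -run TestClusterSwitchover_MultiCluster ./engine/collection/test/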

// ClusterSwitchoverTestConf configures a test case.
type ClusterSwitchoverTestConf struct {
	clusters   uint // # of clusters each epoch
	collectors uint // # of collectors each epoch
}

func (tc *ClusterSwitchoverTestCase) T() *testing.T {
	return tc.t
}

// StartNodes starts all collection nodes in the suite and turns on continuous
// delivery in the stub network.
func (tc *ClusterSwitchoverTestCase) StartNodes() {

	// start all node components
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		node.Start(tc.T())
		nodes = append(nodes, node)
	}

	unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes")

	// start continuous delivery for all nodes
	for _, node := range tc.nodes {
		node.Net.StartConDev(10*time.Millisecond, false)
	}
}

func (tc *ClusterSwitchoverTestCase) StopNodes() {
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		nodes = append(nodes, node)
	}
	unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes")
}

func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header {
	head, err := tc.root.Head()
	require.NoError(tc.T(), err)
	return head
}

func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address {
	return tc.RootBlock().ChainID.Chain().ServiceAddress()
}

// Transaction returns a transaction which is valid for ingestion by a
// collection node in this test suite.
func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody {
	tx := flow.NewTransactionBody().
		AddAuthorizer(tc.ServiceAddress()).
		SetPayer(tc.ServiceAddress()).
		SetScript(unittest.NoopTxScript()).
		SetReferenceBlockID(tc.RootBlock().ID())

	for _, apply := range opts {
		apply(tx)
	}

	return tx
}
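
// For illustration, callers can override any default field through the
// functional options; for example, SubmitTransactionToCluster (below) swaps
// in an epoch-specific reference block (refBlock here is a hypothetical
// header obtained elsewhere):
//
//	tx := tc.Transaction(func(tx *flow.TransactionBody) {
//		tx.SetReferenceBlockID(refBlock.ID())
//	})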

// ExpectTransaction records the expectation that the given transaction will
// be included in the given cluster state for the given epoch.
func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) {
	if _, ok := tc.sentTransactions[epochCounter]; !ok {
		tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList)
	}
	tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex)
	expected := tc.sentTransactions[epochCounter][clusterIndex]
	expected = append(expected, txID)
	tc.sentTransactions[epochCounter][clusterIndex] = expected
}

// ClusterState opens and returns a read-only cluster state for the given node and cluster ID.
func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID, epoch uint64) cluster.State {
	state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID, epoch)
	require.NoError(tc.T(), err)
	return state
}

// State returns the protocol state.
func (tc *ClusterSwitchoverTestCase) State() protocol.State {
	return tc.nodes[0].State
}

// Collector returns the mock node for the collector with the given ID.
func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode {
	for _, node := range tc.nodes {
		if node.Me.NodeID() == id {
			return node
		}
	}
	tc.T().FailNow()
	return testmock.CollectionNode{}
}

// Clusters returns the clusters for the given epoch.
func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster {
	clustering, err := epoch.Clustering()
	require.NoError(tc.T(), err)

	clusters := make([]protocol.Cluster, 0, len(clustering))
	for i := uint(0); i < uint(len(clustering)); i++ {
		cluster, err := epoch.Cluster(i)
		require.NoError(tc.T(), err)
		clusters = append(clusters, cluster)
	}

	return clusters
}

// BlockInEpoch returns the highest block that exists within the bounds of the
// epoch with the given epoch counter.
func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header {
	root := tc.RootBlock()

	for height := root.Height; ; height++ {
		curr := tc.State().AtHeight(height)
		next := tc.State().AtHeight(height + 1)
		curCounter, err := curr.Epochs().Current().Counter()
		require.NoError(tc.T(), err)
		nextCounter, err := next.Epochs().Current().Counter()
		// if we reach a point where the next block doesn't exist, but the
		// current block has the correct counter, return the current block
		if err != nil && curCounter == epochCounter {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}

		// otherwise, walk forward until the next block is in the next epoch -
		// the current block is then the highest block in the requested epoch
		if curCounter == epochCounter && nextCounter == epochCounter+1 {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}
	}
}
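
// As a worked example (heights are hypothetical): if epoch 1 spans heights
// 0-100 and the first finalized block of epoch 2 is at height 101, the scan
// above stops at height 100 for BlockInEpoch(1), since the block at 101
// reports epoch counter 2. For the latest epoch, it instead stops at the
// highest finalized height, where the lookup at height+1 fails.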

// SubmitTransactionToCluster submits a transaction to the given cluster in
// the given epoch and marks the transaction as expected for inclusion in
// the corresponding cluster state.
func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster(
	epochCounter uint64, // the epoch w.r.t. which we are submitting the transaction
	clustering flow.ClusterList, // the clustering for the epoch
	clusterIndex uint, // the index of the cluster we are targeting
) {

	clusterMembers := clustering[int(clusterIndex)]
	// get any block within the target epoch as the transaction's reference block
	refBlock := tc.BlockInEpoch(epochCounter)
	tx := tc.Transaction(func(tx *flow.TransactionBody) {
		tx.SetReferenceBlockID(refBlock.ID())
	})
	clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil)
	tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID())

	// submit the transaction to any collector in this cluster
	err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx)
	require.NoError(tc.T(), err)
}

// CheckClusterState checks the cluster state of the given node (within the given
// cluster) and asserts that only the transactions registered via ExpectTransaction
// are included.
func (tc *ClusterSwitchoverTestCase) CheckClusterState(
	identity *flow.IdentitySkeleton,
	clusterInfo protocol.Cluster,
) {
	node := tc.Collector(identity.NodeID)
	state := tc.ClusterState(node, clusterInfo.ChainID(), clusterInfo.EpochCounter())
	expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()]
	unittest.NewClusterStateChecker(state).
		ExpectTxCount(len(expected)).
		ExpectContainsTx(expected...).
		Assert(tc.T())
}

// Timeout returns the timeout for async tasks for this test case.
func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration {
	// 60s + 10s for each collector
	// locally the whole suite takes
	// * ~8s when run alone
	// * ~15-20s when run in parallel with other packages (default)
	return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors)
}
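
// As a worked example: TestClusterSwitchover_MultiCluster runs with
// 2 collectors, so each wait on the consensus mock is bounded by
// 60s + 10s*2 = 80s; the generous margin over the observed ~8-20s local
// runtime leaves headroom for slower CI machines.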

// RunTestCase comprises the core test logic for cluster switchover. We build
// an epoch, which triggers the beginning of the epoch 2 cluster consensus, then
// send transactions targeting clusters from both epochs while both are running.
func RunTestCase(tc *ClusterSwitchoverTestCase) {

	tc.StartNodes()
	defer tc.StopNodes()

	// keep track of guarantees received at the mock consensus node
	// when a guarantee is received, it indicates that the sender has finalized
	// the corresponding cluster block
	expectedGuaranteesPerEpoch := int(tc.conf.collectors)
	waitForGuarantees := new(sync.WaitGroup)
	waitForGuarantees.Add(expectedGuaranteesPerEpoch)
	tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything).
		Return(nil).
		Run(func(args mock.Arguments) {
			id, ok := args[1].(flow.Identifier)
			require.True(tc.T(), ok)
			_, ok = args[2].(*flow.CollectionGuarantee)
			tc.T().Log("got guarantee from", id.String())
			require.True(tc.T(), ok)
			waitForGuarantees.Done()
		}).
		Times(expectedGuaranteesPerEpoch * 2)

	// build the epoch, ending on the first block of the next epoch
	tc.builder.BuildEpoch().CompleteEpoch()
	// build halfway through the grace period for the epoch 1 cluster
	tc.builder.AddBlocksWithSeals(flow.DefaultTransactionExpiry/2, 1)

	epoch1 := tc.State().Final().Epochs().Previous()
	epoch2 := tc.State().Final().Epochs().Current()

	epoch1Clusters := tc.Clusters(epoch1)
	epoch2Clusters := tc.Clusters(epoch2)
	epoch1Clustering, err := epoch1.Clustering()
	require.NoError(tc.T(), err)
	epoch2Clustering, err := epoch2.Clustering()
	require.NoError(tc.T(), err)

	// submit transactions targeting epoch 1 clusters
	for clusterIndex := range epoch1Clustering {
		tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex))
	}

	// wait for epoch 1 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// submit transactions targeting epoch 2 clusters
	for clusterIndex := range epoch2Clustering {
		tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex))
	}

	waitForGuarantees.Add(expectedGuaranteesPerEpoch)

	// build enough blocks to terminate the epoch 1 cluster consensus
	// NOTE: this is here solely to improve test reliability, as it means that
	// while we are waiting for a guarantee there is only one cluster consensus
	// instance running (per node) rather than two.
	tc.builder.AddBlocksWithSeals(flow.DefaultTransactionExpiry/2+1, 1)

	// wait for epoch 2 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// check epoch 1 cluster states
	for _, clusterInfo := range epoch1Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}

	// check epoch 2 cluster states
	for _, clusterInfo := range epoch2Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}
}
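
// For illustration only (not part of the original suite), a heavier
// configuration would follow the same pattern; whether it passes within
// Timeout() on a given machine is untested here:
//
//	func TestClusterSwitchover_ManyClusters(t *testing.T) {
//		RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
//			clusters:   3,
//			collectors: 6,
//		}))
//	}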