github.com/koko1123/flow-go-1@v0.29.6/engine/collection/test/cluster_switchover_test.go

package test

import (
	"context"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"

	"github.com/koko1123/flow-go-1/cmd/bootstrap/run"
	"github.com/koko1123/flow-go-1/engine/testutil"
	testmock "github.com/koko1123/flow-go-1/engine/testutil/mock"
	model "github.com/koko1123/flow-go-1/model/bootstrap"
	"github.com/koko1123/flow-go-1/model/flow"
	"github.com/koko1123/flow-go-1/model/flow/factory"
	"github.com/koko1123/flow-go-1/model/flow/filter"
	"github.com/koko1123/flow-go-1/module"
	"github.com/koko1123/flow-go-1/module/irrecoverable"
	"github.com/koko1123/flow-go-1/module/util"
	"github.com/koko1123/flow-go-1/network/channels"
	"github.com/koko1123/flow-go-1/network/mocknetwork"
	"github.com/koko1123/flow-go-1/network/stub"
	"github.com/koko1123/flow-go-1/state/cluster"
	bcluster "github.com/koko1123/flow-go-1/state/cluster/badger"
	"github.com/koko1123/flow-go-1/state/protocol"
	"github.com/koko1123/flow-go-1/state/protocol/inmem"
	"github.com/koko1123/flow-go-1/utils/unittest"
)

// ClusterSwitchoverTestCase comprises one test case of the cluster switchover.
// Collection nodes are assigned to one cluster each epoch. On epoch
// boundaries they must gracefully terminate cluster consensus for the ending
// epoch and begin cluster consensus for the new epoch. These two consensus
// committees co-exist for a short period at the beginning of each epoch.
type ClusterSwitchoverTestCase struct {
	t    *testing.T
	conf ClusterSwitchoverTestConf

	identities flow.IdentityList         // identity table
	hub        *stub.Hub                 // mock network hub
	root       protocol.Snapshot         // shared root snapshot
	nodes      []testmock.CollectionNode // collection nodes
	sn         *mocknetwork.Engine       // fake consensus node engine for receiving guarantees
	builder    *unittest.EpochBuilder    // utility for building epochs

	// epoch counter -> cluster index -> transaction IDs
	sentTransactions map[uint64]map[uint]flow.IdentifierList // track submitted transactions
}

// NewClusterSwitchoverTestCase constructs a new cluster switchover test case
// given the configuration, creating all dependencies and mock nodes.
func NewClusterSwitchoverTestCase(t *testing.T, conf ClusterSwitchoverTestConf) *ClusterSwitchoverTestCase {

	tc := &ClusterSwitchoverTestCase{
		t:    t,
		conf: conf,
	}

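	// create collector identities and assign them to clusters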
	nodeInfos := unittest.PrivateNodeInfosFixture(int(conf.collectors), unittest.WithRole(flow.RoleCollection))
	collectors := model.ToIdentityList(nodeInfos)
	tc.identities = unittest.CompleteIdentitySet(collectors...)
	assignment := unittest.ClusterAssignment(tc.conf.clusters, collectors)
	clusters, err := factory.NewClusterList(assignment, collectors)
	require.NoError(t, err)
	rootClusterBlocks := run.GenerateRootClusterBlocks(1, clusters)
	rootClusterQCs := make([]flow.ClusterQCVoteData, len(rootClusterBlocks))
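	// for each cluster, gather the member node infos and generate a root QC
	// signed by those members over the cluster's root block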
	for i, cluster := range clusters {
		signers := make([]model.NodeInfo, 0)
		signerIDs := make([]flow.Identifier, 0)
		for _, identity := range nodeInfos {
			if _, inCluster := cluster.ByNodeID(identity.NodeID); inCluster {
				signers = append(signers, identity)
				signerIDs = append(signerIDs, identity.NodeID)
			}
		}
		qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlocks[i])
		require.NoError(t, err)
		rootClusterQCs[i] = flow.ClusterQCVoteDataFromQC(&flow.QuorumCertificateWithSignerIDs{
			View:      qc.View,
			BlockID:   qc.BlockID,
			SignerIDs: signerIDs,
			SigData:   qc.SigData,
		})
	}

	tc.sentTransactions = make(map[uint64]map[uint]flow.IdentifierList)
	tc.hub = stub.NewNetworkHub()

	// create a root snapshot with the given number of initial clusters
	root, result, seal := unittest.BootstrapFixture(tc.identities)
	qc := unittest.QuorumCertificateFixture(unittest.QCWithBlockID(root.ID()))
	setup := result.ServiceEvents[0].Event.(*flow.EpochSetup)
	commit := result.ServiceEvents[1].Event.(*flow.EpochCommit)

	setup.Assignments = unittest.ClusterAssignment(tc.conf.clusters, tc.identities)
	commit.ClusterQCs = rootClusterQCs

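	// re-point the seal at the modified result before building the snapshot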
	seal.ResultID = result.ID()
	tc.root, err = inmem.SnapshotFromBootstrapState(root, result, seal, qc)
	require.NoError(t, err)

	cancelCtx, cancel := context.WithCancel(context.Background())
	defer cancel()
	ctx := irrecoverable.NewMockSignalerContext(t, cancelCtx)

	// create a mock node for each collector identity
	for _, collector := range nodeInfos {
		node := testutil.CollectionNode(tc.T(), ctx, tc.hub, collector, tc.root)
		tc.nodes = append(tc.nodes, node)
	}

	// create a mock consensus node to receive collection guarantees
	consensus := testutil.GenericNode(
		tc.T(),
		tc.hub,
		tc.identities.Filter(filter.HasRole(flow.RoleConsensus))[0],
		tc.root,
	)
	tc.sn = new(mocknetwork.Engine)
	_, err = consensus.Net.Register(channels.ReceiveGuarantees, tc.sn)
	require.NoError(tc.T(), err)

	// create an epoch builder hooked to each collector's protocol state
	states := make([]protocol.MutableState, 0, len(collectors))
	for _, node := range tc.nodes {
		states = append(states, node.State)
	}
	// when building a new epoch we would like to replace the fixture cluster QCs
	// with real ones; to do that we need to generate them using the node infos
	tc.builder = unittest.NewEpochBuilder(tc.T(), states...).UsingCommitOpts(func(commit *flow.EpochCommit) {
		// build a lookup table for node infos
		nodeInfoLookup := make(map[flow.Identifier]model.NodeInfo)
		for _, nodeInfo := range nodeInfos {
			nodeInfoLookup[nodeInfo.NodeID] = nodeInfo
		}

		// replace the fixture cluster QCs with real data
		for i, clusterQC := range commit.ClusterQCs {
			clusterParticipants := flow.IdentifierList(clusterQC.VoterIDs).Lookup()
			signers := make([]model.NodeInfo, 0, len(clusterParticipants))
			for _, signerID := range clusterQC.VoterIDs {
				signer := nodeInfoLookup[signerID]
				signers = append(signers, signer)
			}

			// generate root cluster block
			rootClusterBlock := cluster.CanonicalRootBlock(commit.Counter, model.ToIdentityList(signers))
			// generate cluster root qc
			qc, err := run.GenerateClusterRootQC(signers, model.ToIdentityList(signers), rootClusterBlock)
			require.NoError(t, err)
			signerIDs := toSignerIDs(signers)
			qcWithSignerIDs := &flow.QuorumCertificateWithSignerIDs{
				View:      qc.View,
				BlockID:   qc.BlockID,
				SignerIDs: signerIDs,
				SigData:   qc.SigData,
			}
			commit.ClusterQCs[i] = flow.ClusterQCVoteDataFromQC(qcWithSignerIDs)
		}
	})

	return tc
}

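// toSignerIDs extracts the node IDs from a list of node infos.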
func toSignerIDs(signers []model.NodeInfo) []flow.Identifier {
	signerIDs := make([]flow.Identifier, 0, len(signers))
	for _, signer := range signers {
		signerIDs = append(signerIDs, signer.NodeID)
	}
	return signerIDs
}

// TestClusterSwitchover_Simple is the simplest switchover case with one single-node cluster.
func TestClusterSwitchover_Simple(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 1,
	}))
}

// TestClusterSwitchover_MultiCollectorCluster tests switchover with a cluster
// containing more than one collector.
func TestClusterSwitchover_MultiCollectorCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   1,
		collectors: 2,
	}))
}

// TestClusterSwitchover_MultiCluster tests cluster switchover with two clusters.
func TestClusterSwitchover_MultiCluster(t *testing.T) {
	RunTestCase(NewClusterSwitchoverTestCase(t, ClusterSwitchoverTestConf{
		clusters:   2,
		collectors: 2,
	}))
}

// ClusterSwitchoverTestConf configures a test case.
type ClusterSwitchoverTestConf struct {
	clusters   uint // # of clusters each epoch
	collectors uint // # of collectors each epoch
}

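// T returns the testing context for this test case.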
func (tc *ClusterSwitchoverTestCase) T() *testing.T {
	return tc.t
}

// StartNodes starts all collection nodes in the suite and turns on continuous
// delivery in the stub network.
func (tc *ClusterSwitchoverTestCase) StartNodes() {

	// start all node components
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		nodes = append(nodes, node)
	}

	unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), time.Second, "could not start nodes")

	// start continuous delivery for all nodes
	for _, node := range tc.nodes {
		node.Net.StartConDev(10*time.Millisecond, false)
	}
}

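// StopNodes stops all collection nodes in the suite.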
func (tc *ClusterSwitchoverTestCase) StopNodes() {
	nodes := make([]module.ReadyDoneAware, 0, len(tc.nodes))
	for _, node := range tc.nodes {
		nodes = append(nodes, node)
	}
	unittest.RequireCloseBefore(tc.T(), util.AllDone(nodes...), time.Second, "could not stop nodes")
}

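// RootBlock returns the header of the root block from the shared root snapshot.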
func (tc *ClusterSwitchoverTestCase) RootBlock() *flow.Header {
	head, err := tc.root.Head()
	require.NoError(tc.T(), err)
	return head
}

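// ServiceAddress returns the service address of the test chain, which is used
// as the payer and authorizer for generated transactions.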
func (tc *ClusterSwitchoverTestCase) ServiceAddress() flow.Address {
	return tc.RootBlock().ChainID.Chain().ServiceAddress()
}

// Transaction returns a transaction which is valid for ingestion by a
// collection node in this test suite.
func (tc *ClusterSwitchoverTestCase) Transaction(opts ...func(*flow.TransactionBody)) *flow.TransactionBody {
	tx := flow.NewTransactionBody().
		AddAuthorizer(tc.ServiceAddress()).
		SetPayer(tc.ServiceAddress()).
		SetScript(unittest.NoopTxScript()).
		SetReferenceBlockID(tc.RootBlock().ID())

	for _, apply := range opts {
		apply(tx)
	}

	return tx
}

// ExpectTransaction records the expectation that the given transaction will be
// included in the given cluster state for the given epoch.
func (tc *ClusterSwitchoverTestCase) ExpectTransaction(epochCounter uint64, clusterIndex uint, txID flow.Identifier) {
	if _, ok := tc.sentTransactions[epochCounter]; !ok {
		tc.sentTransactions[epochCounter] = make(map[uint]flow.IdentifierList)
	}
	tc.T().Logf("expecting transaction %x in epoch %d for cluster %d", txID, epochCounter, clusterIndex)
	expected := tc.sentTransactions[epochCounter][clusterIndex]
	expected = append(expected, txID)
	tc.sentTransactions[epochCounter][clusterIndex] = expected
}

// ClusterState opens and returns a read-only cluster state for the given node and cluster ID.
func (tc *ClusterSwitchoverTestCase) ClusterState(node testmock.CollectionNode, clusterID flow.ChainID) cluster.State {
	state, err := bcluster.OpenState(node.PublicDB, node.Tracer, node.Headers, node.ClusterPayloads, clusterID)
	require.NoError(tc.T(), err)
	return state
}

// State returns the protocol state, as observed by the first collection node.
func (tc *ClusterSwitchoverTestCase) State() protocol.State {
	return tc.nodes[0].State
}

// Collector returns the mock node for the collector with the given ID.
func (tc *ClusterSwitchoverTestCase) Collector(id flow.Identifier) testmock.CollectionNode {
	for _, node := range tc.nodes {
		if node.Me.NodeID() == id {
			return node
		}
	}
	tc.T().FailNow()
	return testmock.CollectionNode{}
}

// Clusters returns the clusters for the given epoch.
func (tc *ClusterSwitchoverTestCase) Clusters(epoch protocol.Epoch) []protocol.Cluster {
	clustering, err := epoch.Clustering()
	require.NoError(tc.T(), err)

	clusters := make([]protocol.Cluster, 0, len(clustering))
	for i := uint(0); i < uint(len(clustering)); i++ {
		cluster, err := epoch.Cluster(i)
		require.NoError(tc.T(), err)
		clusters = append(clusters, cluster)
	}

	return clusters
}

// BlockInEpoch returns the highest block that exists within the bounds of the
// epoch with the given epoch counter.
func (tc *ClusterSwitchoverTestCase) BlockInEpoch(epochCounter uint64) *flow.Header {
	root := tc.RootBlock()

	for height := root.Height; ; height++ {
		curr := tc.State().AtHeight(height)
		next := tc.State().AtHeight(height + 1)
		curCounter, err := curr.Epochs().Current().Counter()
		require.NoError(tc.T(), err)
		nextCounter, err := next.Epochs().Current().Counter()
		// if we reach a point where the next block doesn't exist, but the
		// current block has the correct counter, return the current block
		if err != nil && curCounter == epochCounter {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}

		// otherwise, continue until we reach the block whose successor is in
		// the next epoch - this is the highest block in the requested epoch
		if curCounter == epochCounter && nextCounter == epochCounter+1 {
			head, err := curr.Head()
			require.NoError(tc.T(), err)
			return head
		}
	}
}

// SubmitTransactionToCluster submits a transaction to the given cluster in
// the given epoch and marks the transaction as expected for inclusion in
// the corresponding cluster state.
func (tc *ClusterSwitchoverTestCase) SubmitTransactionToCluster(
	epochCounter uint64,         // the epoch w.r.t. which the transaction is submitted
	clustering flow.ClusterList, // the clustering for the epoch
	clusterIndex uint,           // the index of the cluster we are targeting
) {

	clusterMembers := clustering[int(clusterIndex)]
	// get any block within the target epoch as the transaction's reference block
	refBlock := tc.BlockInEpoch(epochCounter)
	tx := tc.Transaction(func(tx *flow.TransactionBody) {
		tx.SetReferenceBlockID(refBlock.ID())
	})
	clusterTx := unittest.AlterTransactionForCluster(*tx, clustering, clusterMembers, nil)
	tc.ExpectTransaction(epochCounter, clusterIndex, clusterTx.ID())

	// submit the transaction to any collector in this cluster
	err := tc.Collector(clusterMembers[0].NodeID).IngestionEngine.ProcessTransaction(&clusterTx)
	require.NoError(tc.T(), err)
}

// CheckClusterState checks the cluster state of the given node (within the given
// cluster) and asserts that only transactions specified by ExpectTransaction are
// included.
func (tc *ClusterSwitchoverTestCase) CheckClusterState(
	identity *flow.Identity,
	clusterInfo protocol.Cluster,
) {
	node := tc.Collector(identity.NodeID)
	state := tc.ClusterState(node, clusterInfo.ChainID())
	expected := tc.sentTransactions[clusterInfo.EpochCounter()][clusterInfo.Index()]
	unittest.NewClusterStateChecker(state).
		ExpectTxCount(len(expected)).
		ExpectContainsTx(expected...).
		Assert(tc.T())
}

// Timeout returns the timeout for async tasks for this test case.
func (tc *ClusterSwitchoverTestCase) Timeout() time.Duration {
	// 60s + 10s for each collector
	// locally the whole suite takes
	// * ~8s when run alone
	// * ~15-20s when run in parallel with other packages (default)
	return 60*time.Second + 10*time.Second*time.Duration(tc.conf.collectors)
}

// RunTestCase comprises the core test logic for cluster switchover. We build
// an epoch, which triggers the beginning of the epoch 2 cluster consensus, then
// send transactions targeting clusters from both epochs while both are running.
func RunTestCase(tc *ClusterSwitchoverTestCase) {

	tc.StartNodes()
	defer tc.StopNodes()

	// keep track of guarantees received at the mock consensus node
	// when a guarantee is received, it indicates that the sender has finalized
	// the corresponding cluster block
	expectedGuaranteesPerEpoch := int(tc.conf.collectors)
	waitForGuarantees := new(sync.WaitGroup)
	waitForGuarantees.Add(expectedGuaranteesPerEpoch)
	// expect one guarantee per collector for each of the two epochs
	tc.sn.On("Process", mock.Anything, mock.Anything, mock.Anything).
		Return(nil).
		Run(func(args mock.Arguments) {
			id, ok := args[1].(flow.Identifier)
			require.True(tc.T(), ok)
			_, ok = args[2].(*flow.CollectionGuarantee)
			tc.T().Log("got guarantee from", id.String())
			require.True(tc.T(), ok)
			waitForGuarantees.Done()
		}).
		Times(expectedGuaranteesPerEpoch * 2)

	// build the epoch, ending on the first block on the next epoch
	tc.builder.BuildEpoch().CompleteEpoch()
	// build halfway through the grace period for the epoch 1 cluster
	tc.builder.BuildBlocks(flow.DefaultTransactionExpiry / 2)

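	// with epoch 2 underway, query the clusterings of both the previous and current epochs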
	epoch1 := tc.State().Final().Epochs().Previous()
	epoch2 := tc.State().Final().Epochs().Current()

	epoch1Clusters := tc.Clusters(epoch1)
	epoch2Clusters := tc.Clusters(epoch2)
	epoch1Clustering, err := epoch1.Clustering()
	require.NoError(tc.T(), err)
	epoch2Clustering, err := epoch2.Clustering()
	require.NoError(tc.T(), err)

	// submit transactions targeting epoch 1 clusters
	for clusterIndex := range epoch1Clustering {
		tc.SubmitTransactionToCluster(1, epoch1Clustering, uint(clusterIndex))
	}

	// wait for epoch 1 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// submit transactions targeting epoch 2 clusters
	for clusterIndex := range epoch2Clustering {
		tc.SubmitTransactionToCluster(2, epoch2Clustering, uint(clusterIndex))
	}

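	// expect a second round of guarantees, one per collector, for epoch 2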
	waitForGuarantees.Add(expectedGuaranteesPerEpoch)

	// build enough blocks to terminate the epoch 1 cluster consensus
	// NOTE: this is here solely to improve test reliability, as it means that
	// while we are waiting for a guarantee there is only one cluster consensus
	// instance running (per node) rather than two.
	tc.builder.BuildBlocks(flow.DefaultTransactionExpiry/2 + 1)

	// wait for epoch 2 transactions to be guaranteed
	unittest.RequireReturnsBefore(tc.T(), waitForGuarantees.Wait, tc.Timeout(), "did not receive guarantees at consensus node")

	// check epoch 1 cluster states
	for _, clusterInfo := range epoch1Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}

	// check epoch 2 cluster states
	for _, clusterInfo := range epoch2Clusters {
		for _, member := range clusterInfo.Members() {
			tc.CheckClusterState(member, clusterInfo)
		}
	}
}