github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/test/cohort2/epochtransition_test.go (about)

     1  package cohort2
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"math/rand"
     7  	"os"
     8  	"reflect"
     9  	"runtime"
    10  	"sync"
    11  	"testing"
    12  	"time"
    13  
    14  	"github.com/ipfs/go-log"
    15  	"github.com/rs/zerolog"
    16  	"github.com/stretchr/testify/assert"
    17  	"github.com/stretchr/testify/mock"
    18  	"github.com/stretchr/testify/require"
    19  	"github.com/stretchr/testify/suite"
    20  
    21  	"github.com/onflow/flow-go/model/flow"
    22  	"github.com/onflow/flow-go/model/flow/filter"
    23  	"github.com/onflow/flow-go/model/libp2p/message"
    24  	"github.com/onflow/flow-go/module/irrecoverable"
    25  	"github.com/onflow/flow-go/network"
    26  	"github.com/onflow/flow-go/network/channels"
    27  	"github.com/onflow/flow-go/network/internal/testutils"
    28  	"github.com/onflow/flow-go/network/p2p"
    29  	"github.com/onflow/flow-go/network/underlay"
    30  	mockprotocol "github.com/onflow/flow-go/state/protocol/mock"
    31  	"github.com/onflow/flow-go/utils/unittest"
    32  )
    33  
    34  // MutableIdentityTableSuite tests that the networking layer responds correctly
    35  // to changes to the identity table. When nodes are added, we should update our
    36  // topology and accept connections from these new nodes. When nodes are removed
    37  // or ejected we should update our topology and restrict connections from these
    38  // nodes.
    39  type MutableIdentityTableSuite struct {
    40  	suite.Suite
    41  	testutils.ConduitWrapper
    42  	testNodes        testNodeList
    43  	removedTestNodes testNodeList // test nodes which might have been removed from the mesh
    44  	state            *mockprotocol.State
    45  	snapshot         *mockprotocol.Snapshot
    46  	logger           zerolog.Logger
    47  	cancels          []context.CancelFunc
    48  }
    49  
    50  // testNode encapsulates the node state which includes its identity, libp2p node, network,
    51  // mesh engine and the id refresher
    52  type testNode struct {
    53  	id         *flow.Identity
    54  	libp2pNode p2p.LibP2PNode
    55  	network    *underlay.Network
    56  	engine     *testutils.MeshEngine
    57  }
    58  
    59  // testNodeList encapsulates a list of test node and
    60  // has functions to retrieve the different elements of the test nodes in a concurrency safe manner
    61  type testNodeList struct {
    62  	sync.RWMutex
    63  	nodes []testNode
    64  }
    65  
    66  func newTestNodeList() testNodeList {
    67  	return testNodeList{}
    68  }
    69  
    70  func (t *testNodeList) append(node testNode) {
    71  	t.Lock()
    72  	defer t.Unlock()
    73  	t.nodes = append(t.nodes, node)
    74  }
    75  
    76  func (t *testNodeList) remove() testNode {
    77  	t.Lock()
    78  	defer t.Unlock()
    79  	// choose a random node to remove
    80  	i := rand.Intn(len(t.nodes))
    81  	removedNode := t.nodes[i]
    82  	t.nodes = append(t.nodes[:i], t.nodes[i+1:]...)
    83  	return removedNode
    84  }
    85  
    86  func (t *testNodeList) ids() flow.IdentityList {
    87  	t.RLock()
    88  	defer t.RUnlock()
    89  	ids := make(flow.IdentityList, len(t.nodes))
    90  	for i, node := range t.nodes {
    91  		ids[i] = node.id
    92  	}
    93  	return ids
    94  }
    95  
    96  func (t *testNodeList) lastAdded() (testNode, error) {
    97  	t.RLock()
    98  	defer t.RUnlock()
    99  	if len(t.nodes) > 0 {
   100  		return t.nodes[len(t.nodes)-1], nil
   101  	}
   102  	return testNode{}, fmt.Errorf("node list empty")
   103  }
   104  
   105  func (t *testNodeList) engines() []*testutils.MeshEngine {
   106  	t.RLock()
   107  	defer t.RUnlock()
   108  	engs := make([]*testutils.MeshEngine, len(t.nodes))
   109  	for i, node := range t.nodes {
   110  		engs[i] = node.engine
   111  	}
   112  	return engs
   113  }
   114  
   115  func (t *testNodeList) networks() []network.EngineRegistry {
   116  	t.RLock()
   117  	defer t.RUnlock()
   118  	nets := make([]network.EngineRegistry, len(t.nodes))
   119  	for i, node := range t.nodes {
   120  		nets[i] = node.network
   121  	}
   122  	return nets
   123  }
   124  
   125  func (t *testNodeList) libp2pNodes() []p2p.LibP2PNode {
   126  	t.RLock()
   127  	defer t.RUnlock()
   128  	nodes := make([]p2p.LibP2PNode, len(t.nodes))
   129  	for i, node := range t.nodes {
   130  		nodes[i] = node.libp2pNode
   131  	}
   132  	return nodes
   133  }
   134  
   135  func TestMutableIdentityTable(t *testing.T) {
   136  	unittest.SkipUnless(t, unittest.TEST_TODO, "broken test")
   137  	suite.Run(t, new(MutableIdentityTableSuite))
   138  }
   139  
   140  // signalIdentityChanged update IDs for all the current set of nodes (simulating an epoch)
   141  func (suite *MutableIdentityTableSuite) signalIdentityChanged() {
   142  	for _, n := range suite.testNodes.nodes {
   143  		n.network.UpdateNodeAddresses()
   144  	}
   145  }
   146  
   147  func (suite *MutableIdentityTableSuite) SetupTest() {
   148  	suite.testNodes = newTestNodeList()
   149  	suite.removedTestNodes = newTestNodeList()
   150  
   151  	nodeCount := 10
   152  	suite.logger = zerolog.New(os.Stderr).Level(zerolog.ErrorLevel)
   153  	log.SetAllLoggers(log.LevelError)
   154  
   155  	suite.setupStateMock()
   156  	suite.addNodes(nodeCount)
   157  
   158  	// simulate a start of an epoch by signaling a change in the identity table
   159  	suite.signalIdentityChanged()
   160  
   161  	// wait for two lip2p heatbeats for the nodes to discover each other and form the mesh
   162  	time.Sleep(2 * time.Second)
   163  }
   164  
   165  // TearDownTest closes all the networks within a specified timeout
   166  func (suite *MutableIdentityTableSuite) TearDownTest() {
   167  	for _, cancel := range suite.cancels {
   168  		cancel()
   169  	}
   170  	networks := append(suite.testNodes.networks(), suite.removedTestNodes.networks()...)
   171  	testutils.StopComponents(suite.T(), networks, 3*time.Second)
   172  }
   173  
   174  // setupStateMock setup state related mocks (all networks share the same state mock)
   175  func (suite *MutableIdentityTableSuite) setupStateMock() {
   176  	final := unittest.BlockHeaderFixture()
   177  	suite.state = new(mockprotocol.State)
   178  	suite.snapshot = new(mockprotocol.Snapshot)
   179  	suite.snapshot.On("Head").Return(&final, nil)
   180  	suite.snapshot.On("Phase").Return(flow.EpochPhaseCommitted, nil)
   181  	// return all the current list of ids for the state.Final.Identities call made by the network
   182  	suite.snapshot.On("Identities", mock.Anything).Return(
   183  		func(flow.IdentityFilter[flow.Identity]) flow.IdentityList {
   184  			return suite.testNodes.ids()
   185  		},
   186  		func(flow.IdentityFilter[flow.Identity]) error { return nil })
   187  	suite.state.On("Final").Return(suite.snapshot, nil)
   188  }
   189  
   190  // addNodes creates count many new nodes and appends them to the suite state variables
   191  func (suite *MutableIdentityTableSuite) addNodes(count int) {
   192  	ctx, cancel := context.WithCancel(context.Background())
   193  	signalerCtx := irrecoverable.NewMockSignalerContext(suite.T(), ctx)
   194  	sporkId := unittest.IdentifierFixture()
   195  	ids, nodes := testutils.LibP2PNodeForNetworkFixture(suite.T(), sporkId, count)
   196  	nets, _ := testutils.NetworksFixture(suite.T(), sporkId, ids, nodes)
   197  	suite.cancels = append(suite.cancels, cancel)
   198  
   199  	// starts the nodes and networks
   200  	testutils.StartNodes(signalerCtx, suite.T(), nodes)
   201  	for _, net := range nets {
   202  		testutils.StartNetworks(signalerCtx, suite.T(), []network.EngineRegistry{net})
   203  		unittest.RequireComponentsReadyBefore(suite.T(), 1*time.Second, net)
   204  	}
   205  
   206  	// create the engines for the new nodes
   207  	engines := make([]*testutils.MeshEngine, count)
   208  	for i, n := range nets {
   209  		eng := testutils.NewMeshEngine(suite.T(), n, 100, channels.TestNetworkChannel)
   210  		engines[i] = eng
   211  	}
   212  
   213  	// create the test engines
   214  	for i := 0; i < count; i++ {
   215  		node := testNode{
   216  			id:         ids[i],
   217  			libp2pNode: nodes[i],
   218  			network:    nets[i],
   219  			engine:     engines[i],
   220  		}
   221  		suite.testNodes.append(node)
   222  	}
   223  }
   224  
   225  // removeNode removes a randomly chosen test node from suite.testNodes and adds it to suite.removedTestNodes
   226  func (suite *MutableIdentityTableSuite) removeNode() testNode {
   227  	removedNode := suite.testNodes.remove()
   228  	suite.removedTestNodes.append(removedNode)
   229  	return removedNode
   230  }
   231  
   232  // TestNewNodeAdded tests that when a new node is added to the identity list e.g. on an epoch,
   233  // then it can connect to the network.
   234  func (suite *MutableIdentityTableSuite) TestNewNodeAdded() {
   235  
   236  	// add a new node the current list of nodes
   237  	suite.addNodes(1)
   238  
   239  	newNode, err := suite.testNodes.lastAdded()
   240  	require.NoError(suite.T(), err)
   241  	newID := newNode.id
   242  
   243  	suite.logger.Debug().
   244  		Str("new_node", newID.NodeID.String()).
   245  		Msg("added one node")
   246  
   247  	// update IDs for all the networks (simulating an epoch)
   248  	suite.signalIdentityChanged()
   249  
   250  	ids := suite.testNodes.ids()
   251  	engs := suite.testNodes.engines()
   252  
   253  	// check if the new node has sufficient connections with the existing nodes
   254  	// if it does, then it has been inducted successfully in the network
   255  	suite.assertConnected(newNode.libp2pNode, suite.testNodes.libp2pNodes())
   256  
   257  	// check that all the engines on this new epoch can talk to each other using any of the three networking primitives
   258  	suite.assertNetworkPrimitives(ids, engs, nil, nil)
   259  }
   260  
   261  // TestNodeRemoved tests that when an existing node is removed from the identity
   262  // list (ie. as a result of an ejection or transition into an epoch where that node
   263  // has un-staked) then it cannot connect to the network.
   264  func (suite *MutableIdentityTableSuite) TestNodeRemoved() {
   265  	// removed a node
   266  	removedNode := suite.removeNode()
   267  	removedID := removedNode.id
   268  	removedEngine := removedNode.engine
   269  
   270  	// update IDs for all the remaining nodes
   271  	// the removed node continues with the old identity list as we don't want to rely on it updating its ids list
   272  	suite.signalIdentityChanged()
   273  
   274  	remainingIDs := suite.testNodes.ids()
   275  	remainingEngs := suite.testNodes.engines()
   276  
   277  	// assert that the removed node has no connections with any of the other nodes
   278  	suite.assertDisconnected(removedNode.libp2pNode, suite.testNodes.libp2pNodes())
   279  
   280  	// check that all remaining engines can still talk to each other while the ones removed can't
   281  	// using any of the three networking primitives
   282  	removedIDs := []*flow.Identity{removedID}
   283  	removedEngines := []*testutils.MeshEngine{removedEngine}
   284  
   285  	// assert that all three network primitives still work
   286  	suite.assertNetworkPrimitives(remainingIDs, remainingEngs, removedIDs, removedEngines)
   287  }
   288  
   289  // TestNodesAddedAndRemoved tests that:
   290  // a. a newly added node can exchange messages with the existing nodes
   291  // b. a node that has has been removed cannot exchange messages with the existing nodes
   292  func (suite *MutableIdentityTableSuite) TestNodesAddedAndRemoved() {
   293  
   294  	// remove a node
   295  	removedNode := suite.removeNode()
   296  	removedID := removedNode.id
   297  	removedEngine := removedNode.engine
   298  
   299  	// add a node
   300  	suite.addNodes(1)
   301  	newNode, err := suite.testNodes.lastAdded()
   302  	require.NoError(suite.T(), err)
   303  
   304  	// update all current nodes
   305  	suite.signalIdentityChanged()
   306  
   307  	remainingIDs := suite.testNodes.ids()
   308  	remainingEngs := suite.testNodes.engines()
   309  
   310  	// check if the new node has sufficient connections with the existing nodes
   311  	suite.assertConnected(newNode.libp2pNode, suite.testNodes.libp2pNodes())
   312  
   313  	// assert that the removed node has no connections with any of the other nodes
   314  	suite.assertDisconnected(removedNode.libp2pNode, suite.testNodes.libp2pNodes())
   315  
   316  	// check that all remaining engines can still talk to each other while the ones removed can't
   317  	// using any of the three networking primitives
   318  	removedIDs := []*flow.Identity{removedID}
   319  	removedEngines := []*testutils.MeshEngine{removedEngine}
   320  
   321  	// assert that all three network primitives still work
   322  	suite.assertNetworkPrimitives(remainingIDs, remainingEngs, removedIDs, removedEngines)
   323  }
   324  
   325  // assertConnected checks that a libp2p node is directly connected
   326  // to at least half of the other nodes.
   327  func (suite *MutableIdentityTableSuite) assertConnected(thisNode p2p.LibP2PNode, allNodes []p2p.LibP2PNode) {
   328  	t := suite.T()
   329  	threshold := len(allNodes) / 2
   330  	require.Eventuallyf(t, func() bool {
   331  		connections := 0
   332  		for _, node := range allNodes {
   333  			if node == thisNode {
   334  				// we don't want to check if a node is connected to itself
   335  				continue
   336  			}
   337  			connected, err := thisNode.IsConnected(node.ID())
   338  			require.NoError(t, err)
   339  			if connected {
   340  				connections++
   341  			}
   342  		}
   343  		suite.logger.Debug().
   344  			Int("threshold", threshold).
   345  			Int("connections", connections).
   346  			Msg("current connection count")
   347  		return connections >= threshold
   348  	}, 5*time.Second, 100*time.Millisecond, "node is not connected to enough nodes")
   349  }
   350  
   351  // assertDisconnected checks that a libp2p node is not connected to any of the other nodes specified in the
   352  // ids list.
   353  func (suite *MutableIdentityTableSuite) assertDisconnected(thisNode p2p.LibP2PNode, allNodes []p2p.LibP2PNode) {
   354  	t := suite.T()
   355  	require.Eventuallyf(t, func() bool {
   356  		for _, node := range allNodes {
   357  			connected, err := thisNode.IsConnected(node.ID())
   358  			require.NoError(t, err)
   359  			if connected {
   360  				return false
   361  			}
   362  		}
   363  		return true
   364  	}, 5*time.Second, 100*time.Millisecond, "node is still connected")
   365  }
   366  
   367  // assertNetworkPrimitives asserts that allowed engines can exchange messages between themselves but not with the
   368  // disallowed engines using each of the three network primitives
   369  func (suite *MutableIdentityTableSuite) assertNetworkPrimitives(
   370  	allowedIDs flow.IdentityList,
   371  	allowedEngs []*testutils.MeshEngine,
   372  	disallowedIDs flow.IdentityList,
   373  	disallowedEngs []*testutils.MeshEngine) {
   374  	suite.Run("Publish", func() {
   375  		suite.exchangeMessages(allowedIDs, allowedEngs, disallowedIDs, disallowedEngs, suite.Publish, false)
   376  	})
   377  	suite.Run("Multicast", func() {
   378  		suite.exchangeMessages(allowedIDs, allowedEngs, disallowedIDs, disallowedEngs, suite.Multicast, false)
   379  	})
   380  	suite.Run("Unicast", func() {
   381  		// unicast send from or to a node that has been evicted should fail with an error
   382  		suite.exchangeMessages(allowedIDs, allowedEngs, disallowedIDs, disallowedEngs, suite.Unicast, true)
   383  	})
   384  }
   385  
   386  // exchangeMessages verifies that allowed engines can successfully exchange messages between them while disallowed
   387  // engines can't using the ConduitSendWrapperFunc network primitive
   388  func (suite *MutableIdentityTableSuite) exchangeMessages(
   389  	allowedIDs flow.IdentityList,
   390  	allowedEngs []*testutils.MeshEngine,
   391  	disallowedIDs flow.IdentityList,
   392  	disallowedEngs []*testutils.MeshEngine,
   393  	send testutils.ConduitSendWrapperFunc,
   394  	expectSendErrorForDisallowedIDs bool) {
   395  
   396  	// send a message from each of the allowed engine to the other allowed engines
   397  	for i, allowedEng := range allowedEngs {
   398  
   399  		fromID := allowedIDs[i].NodeID
   400  		targetIDs := allowedIDs.Filter(filter.Not(filter.HasNodeID[flow.Identity](allowedIDs[i].NodeID)))
   401  
   402  		err := suite.sendMessage(fromID, allowedEng, targetIDs, send)
   403  		require.NoError(suite.T(), err)
   404  	}
   405  
   406  	// send a message from each of the allowed engine to all of the disallowed engines
   407  	if len(disallowedEngs) > 0 {
   408  		for i, fromEng := range allowedEngs {
   409  
   410  			fromID := allowedIDs[i].NodeID
   411  			targetIDs := disallowedIDs
   412  
   413  			err := suite.sendMessage(fromID, fromEng, targetIDs, send)
   414  			if expectSendErrorForDisallowedIDs {
   415  				require.Error(suite.T(), err)
   416  			}
   417  		}
   418  	}
   419  
   420  	// send a message from each of the disallowed engine to each of the allowed engines
   421  	for i, fromEng := range disallowedEngs {
   422  
   423  		fromID := disallowedIDs[i].NodeID
   424  		targetIDs := allowedIDs
   425  
   426  		err := suite.sendMessage(fromID, fromEng, targetIDs, send)
   427  		if expectSendErrorForDisallowedIDs {
   428  			require.Error(suite.T(), err)
   429  		}
   430  	}
   431  
   432  	count := len(allowedEngs)
   433  	expectedMsgCnt := count - 1
   434  	wg := sync.WaitGroup{}
   435  	// fires a goroutine for each of the allowed engine to listen for incoming messages
   436  	for i := range allowedEngs {
   437  		wg.Add(expectedMsgCnt)
   438  		go func(e *testutils.MeshEngine) {
   439  			for x := 0; x < expectedMsgCnt; x++ {
   440  				<-e.Received
   441  				wg.Done()
   442  			}
   443  		}(allowedEngs[i])
   444  	}
   445  
   446  	// assert that all allowed engines received expectedMsgCnt number of messages
   447  	unittest.AssertReturnsBefore(suite.T(), wg.Wait, 5*time.Second)
   448  	// assert that all allowed engines received no other messages
   449  	for i := range allowedEngs {
   450  		assert.Empty(suite.T(), allowedEngs[i].Received)
   451  	}
   452  
   453  	// assert that the disallowed engines didn't receive any message
   454  	for i, eng := range disallowedEngs {
   455  		unittest.RequireNeverClosedWithin(suite.T(), eng.Received, time.Millisecond,
   456  			fmt.Sprintf("%s engine should not have recevied message", disallowedIDs[i]))
   457  	}
   458  }
   459  
   460  func (suite *MutableIdentityTableSuite) sendMessage(fromID flow.Identifier,
   461  	fromEngine *testutils.MeshEngine,
   462  	toIDs flow.IdentityList,
   463  	send testutils.ConduitSendWrapperFunc) error {
   464  
   465  	primitive := runtime.FuncForPC(reflect.ValueOf(send).Pointer()).Name()
   466  	event := &message.TestMessage{
   467  		Text: fmt.Sprintf("hello from node %s using %s", fromID.String(), primitive),
   468  	}
   469  
   470  	return send(event, fromEngine.Con, toIDs.NodeIDs()...)
   471  }