github.com/cilium/cilium@v1.16.2/pkg/clustermesh/clustermesh_test.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package clustermesh

import (
	"context"
	"fmt"
	"os"
	"path"
	"sync"
	"testing"

	"github.com/cilium/hive/hivetest"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/cilium/cilium/pkg/clustermesh/common"
	"github.com/cilium/cilium/pkg/clustermesh/types"
	cmutils "github.com/cilium/cilium/pkg/clustermesh/utils"
	"github.com/cilium/cilium/pkg/identity/cache"
	"github.com/cilium/cilium/pkg/ipcache"
	"github.com/cilium/cilium/pkg/kvstore"
	"github.com/cilium/cilium/pkg/kvstore/store"
	"github.com/cilium/cilium/pkg/lock"
	nodeStore "github.com/cilium/cilium/pkg/node/store"
	nodeTypes "github.com/cilium/cilium/pkg/node/types"
	"github.com/cilium/cilium/pkg/testutils"
	testidentity "github.com/cilium/cilium/pkg/testutils/identity"
)

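// localClusterID and localClusterName identify the local cluster that this
// test impersonates when connecting to the mock remote clusters below.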
const (
	localClusterID   = 99
	localClusterName = "local"
)

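// testObserver is a minimal node observer fulfilling the NodeObserver field of
// the clustermesh Configuration: it records the nodes received through the
// NodeUpdated and NodeDeleted callbacks, keyed by their full name, so that the
// test can assert on the set of nodes currently known from remote clusters.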
type testObserver struct {
	nodes      map[string]*nodeTypes.Node
	nodesMutex lock.RWMutex
}

func newNodesObserver() *testObserver {
	return &testObserver{nodes: make(map[string]*nodeTypes.Node)}
}

func (o *testObserver) NodeUpdated(no nodeTypes.Node) {
	o.nodesMutex.Lock()
	o.nodes[no.Fullname()] = &no
	o.nodesMutex.Unlock()
}

func (o *testObserver) NodeDeleted(no nodeTypes.Node) {
	o.nodesMutex.Lock()
	delete(o.nodes, no.Fullname())
	o.nodesMutex.Unlock()
}

func TestClusterMesh(t *testing.T) {
	testutils.IntegrationTest(t)

	var wg sync.WaitGroup
	ctx, cancel := context.WithCancel(context.Background())
	defer func() {
		cancel()
		wg.Wait()
	}()

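	// Set up the dummy etcd-backed kvstore shared by the local cluster and all
	// the mock remote clusters in this test.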
	kvstore.SetupDummy(t, "etcd")

	// The nil argument is only used by k8s CRD-backed identities; we default to the kvstore backend.
	mgr := cache.NewCachingIdentityAllocator(&testidentity.IdentityAllocatorOwnerMock{})
	<-mgr.InitIdentityAllocator(nil)
	t.Cleanup(mgr.Close)

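	// Each remote cluster is configured through a file containing this minimal
	// etcd configuration, pointing at the dummy kvstore set up above.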
	dir := t.TempDir()
	etcdConfig := []byte(fmt.Sprintf("endpoints:\n- %s\n", kvstore.EtcdDummyAddress()))

	// cluster3 doesn't have a cluster configuration in the kvstore.
	// We should not be able to establish a connection in this case.
	for i, name := range []string{"test2", "cluster1", "cluster2"} {
		config := types.CiliumClusterConfig{
			ID: uint32(i + 1),
			Capabilities: types.CiliumClusterConfigCapabilities{
				MaxConnectedClusters: 255,
			},
		}

		if name == "cluster2" {
			// Cluster2 supports synced canaries
			config.Capabilities.SyncedCanaries = true
		}

		err := cmutils.SetClusterConfig(ctx, name, config, kvstore.Client())
		require.NoErrorf(t, err, "Failed to set cluster config for %s", name)
	}

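	// Write one clustermesh configuration file per remote cluster. ClusterMesh
	// watches the configuration directory and connects to (or disconnects from)
	// a remote cluster whenever the corresponding file appears or is removed.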
	config1 := path.Join(dir, "cluster1")
	require.NoError(t, os.WriteFile(config1, etcdConfig, 0644), "Failed to write config file for cluster1")

	config2 := path.Join(dir, "cluster2")
	require.NoError(t, os.WriteFile(config2, etcdConfig, 0644), "Failed to write config file for cluster2")

	config3 := path.Join(dir, "cluster3")
	require.NoError(t, os.WriteFile(config3, etcdConfig, 0644), "Failed to write config file for cluster3")

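	// The ClusterMesh configuration takes a real IPCache instance, although
	// this test does not assert on its content.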
	ipc := ipcache.NewIPCache(&ipcache.Configuration{
		Context: ctx,
	})
	t.Cleanup(func() { ipc.Shutdown() })

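	// Instantiate the ClusterMesh subsystem for the local cluster, along with
	// its dependencies (used cluster IDs tracker, store factory and node
	// observer). The hivetest lifecycle ties its start and stop hooks to the
	// lifetime of the test.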
	usedIDs := NewClusterMeshUsedIDs(localClusterID)
	storeFactory := store.NewFactory(store.MetricsProvider())
	nodesObserver := newNodesObserver()
	cm := NewClusterMesh(hivetest.Lifecycle(t), Configuration{
		Config:                common.Config{ClusterMeshConfig: dir},
		ClusterInfo:           types.ClusterInfo{ID: localClusterID, Name: localClusterName, MaxConnectedClusters: 255},
		NodeObserver:          nodesObserver,
		RemoteIdentityWatcher: mgr,
		IPCache:               ipc,
		ClusterIDsManager:     usedIDs,
		Metrics:               NewMetrics(),
		CommonMetrics:         common.MetricsProvider(subsystem)(),
		StoreFactory:          storeFactory,
		Logger:                logrus.New(),
	})
	require.NotNil(t, cm, "Failed to initialize clustermesh")
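	// nodesWSS emulates the nodes store written by the remote clusters into the
	// shared kvstore: the test upserts node entries through it to simulate
	// nodes being announced remotely.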
	// cluster2 is the cluster which is tested with sync canaries
	nodesWSS := storeFactory.NewSyncStore("cluster2", kvstore.Client(), nodeStore.NodeStorePrefix)
	wg.Add(1)
	go func() {
		nodesWSS.Run(ctx)
		wg.Done()
	}()
	nodeNames := []string{"foo", "bar", "baz"}

	// wait for the two expected clusters (cluster1 and cluster2) to become ready;
	// cluster3 has no cluster configuration in the kvstore and cannot connect.
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		assert.Equal(c, 2, cm.NumReadyClusters())
	}, timeout, tick, "Clusters did not become ready in time")

	// Ensure that the remote ClusterIDs are reserved correctly after connecting
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		usedIDs.UsedClusterIDsMutex.Lock()
		defer usedIDs.UsedClusterIDsMutex.Unlock()

		assert.Contains(c, usedIDs.UsedClusterIDs, uint32(2))
		assert.Contains(c, usedIDs.UsedClusterIDs, uint32(3))
		assert.Len(c, usedIDs.UsedClusterIDs, 2)
	}, timeout, tick, "Cluster IDs were not reserved correctly")

	// Reconnect cluster1 with a changed ClusterID
	config := types.CiliumClusterConfig{
		ID: 255,
		Capabilities: types.CiliumClusterConfigCapabilities{
			MaxConnectedClusters: 255,
		},
	}
	err := cmutils.SetClusterConfig(ctx, "cluster1", config, kvstore.Client())
	require.NoErrorf(t, err, "Failed to set cluster config for cluster1")
	// Ugly hack: rewrite the configuration file with a trailing newline
	// to trigger a configuration update, and hence a reconnection.
	etcdConfigNew := append(etcdConfig, []byte("\n")...)
	require.NoError(t, os.WriteFile(config1, etcdConfigNew, 0644), "Failed to write config file for cluster1")

	require.EventuallyWithT(t, func(c *assert.CollectT) {
		usedIDs.UsedClusterIDsMutex.Lock()
		defer usedIDs.UsedClusterIDsMutex.Unlock()

		// Ensure that the old ClusterID for cluster1 has been released,
		// and that the new ClusterID is now reserved.
		assert.NotContains(c, usedIDs.UsedClusterIDs, uint32(2))
		assert.Contains(c, usedIDs.UsedClusterIDs, uint32(255))
	}, timeout, tick, "Reserved cluster IDs not updated correctly")

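	// Upsert a few nodes for each cluster directly into the kvstore, emulating
	// the nodes announced by the remote clusters. The nodes of cluster3 should
	// never be observed, as no connection is established towards that cluster.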
	for cluster, id := range map[string]uint32{"cluster1": 255, "cluster2": 3, "cluster3": 4} {
		for _, name := range nodeNames {
			require.NoErrorf(t, nodesWSS.UpsertKey(ctx, &nodeTypes.Node{Name: name, Cluster: cluster, ClusterID: id}),
				"Failed upserting node %s/%s into kvstore", cluster, name)
		}
	}

	// Write the sync canary for cluster2
	require.NoError(t, nodesWSS.Synced(ctx), "Failed writing the synced key into kvstore")

	// wait for the nodes of both connected clusters to appear in the node list
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		nodesObserver.nodesMutex.RLock()
		defer nodesObserver.nodesMutex.RUnlock()
		assert.Len(c, nodesObserver.nodes, 2*len(nodeNames))
	}, timeout, tick, "Nodes not watched correctly")

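	// Removing the configuration file of a cluster disconnects it from the
	// mesh: its nodes are drained and its ClusterID is released.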
	require.NoError(t, os.Remove(config2), "Failed to remove config file for cluster2")

	// wait for the removed cluster to disappear
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		assert.Equal(c, 1, cm.NumReadyClusters())
	}, timeout, tick, "Cluster2 was not correctly removed")

	// Make sure that the ID of cluster2 is freed
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		usedIDs.UsedClusterIDsMutex.Lock()
		defer usedIDs.UsedClusterIDsMutex.Unlock()
		assert.NotContains(c, usedIDs.UsedClusterIDs, uint32(3))
		assert.Len(c, usedIDs.UsedClusterIDs, 1)
	}, timeout, tick, "Cluster IDs were not freed correctly")

	// wait for the nodes of the removed cluster to disappear
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		nodesObserver.nodesMutex.RLock()
		defer nodesObserver.nodesMutex.RUnlock()
		assert.Len(c, nodesObserver.nodes, 1*len(nodeNames))
	}, timeout, tick, "Nodes were not drained correctly")

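	// Finally, remove the remaining configuration files as well: all clusters
	// should disconnect, all nodes should be drained and all IDs released.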
	require.NoError(t, os.Remove(config1), "Failed to remove config file for cluster1")
	require.NoError(t, os.Remove(config3), "Failed to remove config file for cluster3")

	// wait for the removed clusters to disappear
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		assert.Equal(c, 0, cm.NumReadyClusters())
	}, timeout, tick, "Clusters were not correctly removed")

	// wait for the nodes of the removed clusters to disappear
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		nodesObserver.nodesMutex.RLock()
		defer nodesObserver.nodesMutex.RUnlock()
		assert.Len(c, nodesObserver.nodes, 0)
	}, timeout, tick, "Nodes were not drained correctly")

	// Make sure that IDs are freed
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		usedIDs.UsedClusterIDsMutex.Lock()
		defer usedIDs.UsedClusterIDsMutex.Unlock()
		assert.Len(c, usedIDs.UsedClusterIDs, 0)
	}, timeout, tick, "Cluster IDs were not freed correctly")
}