github.com/cilium/cilium@v1.16.2/pkg/clustermesh/kvstoremesh/kvstoremesh_test.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package kvstoremesh

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/cilium/hive/cell"
	"github.com/cilium/hive/hivetest"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	baseclocktest "k8s.io/utils/clock/testing"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/clustermesh-apiserver/syncstate"
	"github.com/cilium/cilium/pkg/clustermesh/common"
	"github.com/cilium/cilium/pkg/clustermesh/types"
	"github.com/cilium/cilium/pkg/clustermesh/utils"
	"github.com/cilium/cilium/pkg/hive"
	"github.com/cilium/cilium/pkg/inctimer"
	"github.com/cilium/cilium/pkg/kvstore"
	"github.com/cilium/cilium/pkg/kvstore/store"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/promise"
	"github.com/cilium/cilium/pkg/testutils"
)

// Configure a generous timeout to prevent flakes when running in a noisy CI environment.
var (
	tick    = 10 * time.Millisecond
	timeout = 5 * time.Second
)

type remoteEtcdClientWrapper struct {
	kvstore.BackendOperations
	name   string
	cached bool

	kvs map[string]string
	mu  lock.Mutex

	syncedCanariesWatched bool
}

// Override the ListAndWatch method so that we can propagate whatever events we want, without key conflicts
// with those eventually created by kvstoremesh. Additionally, this allows tracking which prefixes have been watched.
func (w *remoteEtcdClientWrapper) ListAndWatch(ctx context.Context, prefix string, chanSize int) *kvstore.Watcher {
	events := make(kvstore.EventChan, 10)

	w.mu.Lock()
	defer w.mu.Unlock()

	if prefix == fmt.Sprintf("cilium/synced/%s/", w.name) {
		state := "state"
		if w.cached {
			state = "cache"
		}

		w.syncedCanariesWatched = true
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/nodes/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/services/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/identities/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/ip/v1", w.name, state)}
	} else {
		var found bool
		for key, value := range w.kvs {
			if strings.HasPrefix(key, prefix) {
				events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: key, Value: []byte(value)}
				found = true
				delete(w.kvs, key)
			}
		}

		// Emit a single ListDone marker once the whole initial listing has
		// been propagated, rather than one per matched key.
		if found {
			events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeListDone}
		}
	}

	go func() {
		<-ctx.Done()
		close(events)
	}()

	return &kvstore.Watcher{Events: events}
}

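// For illustration only: this is roughly how a consumer would drain the
// initial listing produced by the wrapper above. This hypothetical helper is
// not used by the tests, and only relies on the KeyValueEvent fields already
// exercised in this file.
func drainInitialListing(ctx context.Context, w *kvstore.Watcher) map[string]string {
	kvs := make(map[string]string)
	for {
		select {
		case event, ok := <-w.Events:
			if !ok || event.Typ == kvstore.EventTypeListDone {
				// Channel closure or the ListDone marker ends the initial listing.
				return kvs
			}
			kvs[event.Key] = string(event.Value)
		case <-ctx.Done():
			return kvs
		}
	}
}
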
func clockAdvance(t assert.TestingT, fc *baseclocktest.FakeClock, d time.Duration) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	timer, stop := inctimer.New()
	defer stop()

	for !fc.HasWaiters() {
		select {
		case <-ctx.Done():
			assert.FailNow(t, "Could not advance clock within expected timeout")
		case <-timer.After(1 * time.Millisecond):
		}
	}

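	// Step only wakes timers already registered with the fake clock: a timer
	// created afterwards would start counting from the advanced time and never
	// fire. Hence the polling for waiters above before stepping.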
	fc.Step(d)
}

func TestRemoteClusterRun(t *testing.T) {
	testutils.IntegrationTest(t)

	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set a higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	tests := []struct {
		name   string
		srccfg types.CiliumClusterConfig
		dstcfg types.CiliumClusterConfig
		kvs    map[string]string
	}{
		{
			name:   "remote cluster has empty cluster config",
			srccfg: types.CiliumClusterConfig{},
			dstcfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/state/nodes/v1/foo/bar":    "qux1",
				"cilium/state/services/v1/foo/bar": "qux2",
				"cilium/state/identities/v1/bar":   "qux3",
				"cilium/state/ip/v1/default/bar":   "qux4",
			},
		},
		{
			name: "remote cluster supports the synced canaries",
			srccfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/state/nodes/v1/foo/bar":    "qux1",
				"cilium/state/services/v1/foo/bar": "qux2",
				"cilium/state/identities/v1/bar":   "qux3",
				"cilium/state/ip/v1/default/bar":   "qux4",
			},
		},
		{
			name: "remote cluster supports the cached prefixes",
			srccfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					Cached: true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			},
		},
		{
			name: "remote cluster supports both synced canaries and cached prefixes",
			srccfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			},
		},
	}

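	// In all cases the expected outcome is the same: kvstoremesh reflects the
	// source keys into the cached representation under
	// cilium/cache/<resource>/v1/<cluster>/..., which the assertions below verify.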
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var wg sync.WaitGroup
			ctx, cancel := context.WithCancel(context.Background())

			t.Cleanup(func() {
				cancel()
				wg.Wait()

				require.NoError(t, kvstore.Client().DeletePrefix(context.Background(), kvstore.BaseKeyPrefix))
			})

			remoteClient := &remoteEtcdClientWrapper{
				BackendOperations: kvstore.Client(),
				name:              "foo",
				cached:            tt.srccfg.Capabilities.Cached,
				kvs:               tt.kvs,
			}

			st := store.NewFactory(store.MetricsProvider())
			fakeclock := baseclocktest.NewFakeClock(time.Now())
			km := KVStoreMesh{backend: kvstore.Client(), storeFactory: st, logger: logrus.New(), clock: fakeclock}

			rc := km.newRemoteCluster("foo", nil)
			ready := make(chan error)

			wg.Add(1)
			go func() {
				rc.Run(ctx, remoteClient, tt.srccfg, ready)
				rc.Stop()
				wg.Done()
			}()

			require.NoError(t, <-ready, "rc.Run() failed")

			// Assert that the cluster config got properly propagated
			require.EventuallyWithT(t, func(c *assert.CollectT) {
				cfg, err := utils.GetClusterConfig(ctx, "foo", kvstore.Client())
				assert.NoError(c, err)
				assert.Equal(c, tt.dstcfg, cfg)
			}, timeout, tick, "Failed to retrieve the cluster config")

			// Assert that the keys have been properly reflected
			for key, value := range map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			} {
				require.EventuallyWithTf(t, func(c *assert.CollectT) {
					v, err := kvstore.Client().Get(ctx, key)
					assert.NoError(c, err)
					assert.Equal(c, value, string(v))
				}, timeout, tick, "Key %q does not have the expected value %q", key, value)
			}

			// Assert that the sync canaries have been properly set
			for _, key := range []string{
				"cilium/synced/foo/cilium/cache/nodes/v1",
				"cilium/synced/foo/cilium/cache/services/v1",
				"cilium/synced/foo/cilium/cache/identities/v1",
				"cilium/synced/foo/cilium/cache/ip/v1",
			} {
				require.EventuallyWithTf(t, func(c *assert.CollectT) {
					v, err := kvstore.Client().Get(ctx, key)
					assert.NoError(c, err)
					assert.NotEmpty(c, string(v))
				}, timeout, tick, "Expected sync canary %q is not present", key)
			}

			// Assert that synced canaries have been watched if expected
			require.Equal(t, tt.srccfg.Capabilities.SyncedCanaries, remoteClient.syncedCanariesWatched)

			cancel()
			wg.Wait()

			// rc.Remove waits for a 3-minute grace period before proceeding
			// with the deletion. Let's handle that by advancing the fake time.
			go clockAdvance(t, fakeclock, 3*time.Minute)

			// Assert that Remove() removes all keys previously created
			rc.Remove(context.Background())

			pairs, err := kvstore.Client().ListPrefix(context.Background(), kvstore.BaseKeyPrefix)
			require.NoError(t, err, "Failed to retrieve kvstore keys")
			require.Empty(t, pairs, "Cached keys not correctly removed")
		})
	}
}

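// localClientWrapper wraps the local kvstore client to inject a configurable
// number of failures per key (Delete) or prefix (DeletePrefix), so that the
// tests below can exercise the retry logic of the removal process.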
type localClientWrapper struct {
	kvstore.BackendOperations
	errors map[string]uint
}

func (lcw *localClientWrapper) Delete(ctx context.Context, key string) error {
	if cnt := lcw.errors[key]; cnt > 0 {
		lcw.errors[key] = cnt - 1
		return errors.New("fake error")
	}

	return lcw.BackendOperations.Delete(ctx, key)
}

func (lcw *localClientWrapper) DeletePrefix(ctx context.Context, path string) error {
	if cnt := lcw.errors[path]; cnt > 0 {
		lcw.errors[path] = cnt - 1
		return errors.New("fake error")
	}

	return lcw.BackendOperations.DeletePrefix(ctx, path)
}

func TestRemoteClusterRemove(t *testing.T) {
	testutils.IntegrationTest(t)

	ctx := context.Background()
	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set a higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	keys := func(name string) []string {
		return []string{
			fmt.Sprintf("cilium/cluster-config/%s", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/nodes/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/services/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/identities/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/ip/v1", name),
			fmt.Sprintf("cilium/cache/nodes/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/services/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/identities/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/ip/v1/%s/bar", name),
		}
	}

	wrapper := &localClientWrapper{
		BackendOperations: kvstore.Client(),
		errors: map[string]uint{
			"cilium/cache/identities/v1/foobar/": 1,
			"cilium/cluster-config/baz":          10,
		},
	}

	st := store.NewFactory(store.MetricsProvider())
	fakeclock := baseclocktest.NewFakeClock(time.Now())
	km := KVStoreMesh{backend: wrapper, storeFactory: st, logger: logrus.New(), clock: fakeclock}
	rcs := make(map[string]*remoteCluster)
	for _, cluster := range []string{"foo", "foobar", "baz"} {
		rcs[cluster] = km.newRemoteCluster(cluster, nil).(*remoteCluster)
		rcs[cluster].Stop()
	}

	for _, rc := range rcs {
		for _, key := range keys(rc.name) {
			require.NoError(t, kvstore.Client().Update(ctx, key, []byte("value"), false))
		}
	}

	var wg sync.WaitGroup
	bgrun := func(ctx context.Context, fn func(context.Context)) {
		wg.Add(1)
		go func() {
			fn(ctx)
			wg.Done()
		}()
	}

	assertDeleted := func(t assert.TestingT, ctx context.Context, key string) {
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(t, err, "Failed to retrieve kvstore key %s", key)
		assert.Empty(t, string(value), "Key %s has not been deleted", key)
	}

	assertNotDeleted := func(t assert.TestingT, ctx context.Context, key string) {
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(t, err, "Failed to retrieve kvstore key %s", key)
		assert.NotEmpty(t, string(value), "Key %s has been incorrectly deleted", key)
	}

	// Remove should only delete the cluster config key before grace period expiration
	bgrun(ctx, rcs["foo"].Remove)
	assert.EventuallyWithT(t, func(c *assert.CollectT) {
		assertDeleted(c, ctx, keys("foo")[0])
		for _, key := range keys("foo")[1:] {
			assertNotDeleted(c, ctx, key)
		}
	}, timeout, tick)

	clockAdvance(t, fakeclock, 3*time.Minute-1*time.Millisecond)

	// The grace period should still not have expired
	time.Sleep(tick)
	for _, key := range keys("foo")[1:] {
		assertNotDeleted(t, ctx, key)
	}

	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	// The grace period has expired, so all keys should now have been deleted
	for _, key := range keys("foo") {
		assertDeleted(t, ctx, key)
	}

	// Keys of other clusters should not have been touched
	for _, cluster := range []string{"foobar", "baz"} {
		for _, key := range keys(cluster) {
			assertNotDeleted(t, ctx, key)
		}
	}

	// Simulate the failure of one of the delete calls
	bgrun(ctx, rcs["foobar"].Remove)

	clockAdvance(t, fakeclock, 3*time.Minute)
	assert.EventuallyWithT(t, func(c *assert.CollectT) {
		// Only the keys up to the erroring one should have been deleted
		for _, key := range keys("foobar")[0:7] {
			assertDeleted(c, ctx, key)
		}
		for _, key := range keys("foobar")[7:] {
			assertNotDeleted(c, ctx, key)
		}
	}, timeout, tick)

	clockAdvance(t, fakeclock, 2*time.Second-1*time.Millisecond)
	time.Sleep(tick)
	for _, key := range keys("foobar")[7:] {
		// The backoff should not have expired yet
		assertNotDeleted(t, ctx, key)
	}

	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	for _, key := range keys("foobar") {
		// The backoff expired, so all keys should have been deleted
		assertDeleted(t, ctx, key)
	}

	// Simulate the persistent failure of one of the delete calls
	bgrun(ctx, rcs["baz"].Remove)

	clockAdvance(t, fakeclock, 2*time.Second)  // First retry
	clockAdvance(t, fakeclock, 4*time.Second)  // Second retry
	clockAdvance(t, fakeclock, 8*time.Second)  // Third retry
	clockAdvance(t, fakeclock, 16*time.Second) // Fourth retry

	// Fifth and last retry
	clockAdvance(t, fakeclock, 32*time.Second-1*time.Millisecond)

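	// Note: the deletion backoff doubles at each attempt (2s, 4s, 8s, 16s, 32s);
	// after the fifth failed attempt Remove gives up, as asserted below.
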
	// Make sure that Remove() is still actually waiting: if it weren't,
	// clockAdvance couldn't complete successfully.
	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	for _, key := range keys("baz") {
		// None of the keys should have been deleted, due to the persistent error
		assertNotDeleted(t, ctx, key)
	}

	// The context expires during the grace period
	cctx, cancel := context.WithCancel(context.Background())
	bgrun(cctx, rcs["foo"].Remove)
	clockAdvance(t, fakeclock, 1*time.Minute)
	cancel()
	wg.Wait()

	// Remove the existing waiter that we didn't clean up due to context termination.
	if fakeclock.HasWaiters() {
		fakeclock.Step(5 * time.Minute)
	}

	// The context expires during the backoff
	cctx, cancel = context.WithCancel(context.Background())
	bgrun(cctx, rcs["baz"].Remove)
	clockAdvance(t, fakeclock, 1*time.Minute)
	cancel()
	wg.Wait()

	// Remove the existing waiter that we didn't clean up due to context termination.
	if fakeclock.HasWaiters() {
		fakeclock.Step(5 * time.Minute)
	}
}

func TestRemoteClusterRemoveShutdown(t *testing.T) {
	// Test that the KVStoreMesh shutdown process is not blocked by possible
	// in-progress remote cluster removals.
	testutils.IntegrationTest(t)

	ctx := context.Background()
	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set a higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	dir := t.TempDir()
	cfg := []byte(fmt.Sprintf("endpoints:\n- %s\n", kvstore.EtcdDummyAddress()))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "remote"), cfg, 0644))

	// Let's manually create a fake cluster configuration for the remote cluster,
	// because we are using the same kvstore. This will be used as a synchronization
	// point to stop the hive while blocked waiting for the grace period.
	require.NoError(t, utils.SetClusterConfig(ctx, "remote", types.CiliumClusterConfig{ID: 20}, kvstore.Client()))

	var km *KVStoreMesh
	h := hive.New(
		Cell,

		cell.Provide(
			func() types.ClusterInfo { return types.ClusterInfo{ID: 10, Name: "local"} },
			func() Config { return Config{} },
			func() promise.Promise[kvstore.BackendOperations] {
				clr, clp := promise.New[kvstore.BackendOperations]()
				clr.Resolve(kvstore.Client())
				return clp
			},
		),

		cell.Invoke(func(km_ *KVStoreMesh) { km = km_ }),
	)
	hive.AddConfigOverride(h, func(cfg *common.Config) { cfg.ClusterMeshConfig = dir })

	tlog := hivetest.Logger(t)
	require.NoError(t, h.Start(tlog, ctx), "Failed to start the hive")

	// Wait until the connection has been successfully established before disconnecting.
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		status := km.status()
		if assert.Len(c, status, 1) {
			assert.True(c, status[0].Ready)
		}
	}, timeout, tick, "Failed to connect to the remote cluster")

	require.NoError(t, os.Remove(filepath.Join(dir, "remote")))

	// Wait until the cluster config key has been removed, to ensure that we are
	// actually waiting for the grace period expiration.
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		key := path.Join(kvstore.ClusterConfigPrefix, "remote")
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(c, err, "Failed to retrieve kvstore key %s", key)
		assert.Empty(c, string(value), "Key %s has not been deleted", key)
	}, timeout, tick)

	sctx, cancel := context.WithTimeout(ctx, 1*time.Second)
	defer cancel()
	require.NoError(t, h.Stop(tlog, sctx), "Failed to stop the hive")
}

func TestRemoteClusterStatus(t *testing.T) {
	testutils.IntegrationTest(t)

	kvstore.SetupDummy(t, "etcd")

	var wg sync.WaitGroup
	ctx, cancel := context.WithCancel(context.Background())

	t.Cleanup(func() {
		cancel()
		wg.Wait()

		require.NoError(t, kvstore.Client().DeletePrefix(context.Background(), kvstore.BaseKeyPrefix))
	})

	remoteClient := &remoteEtcdClientWrapper{
		BackendOperations: kvstore.Client(),
		name:              "foo",
		kvs: map[string]string{
			"cilium/state/nodes/v1/foo/bar":    "qux0",
			"cilium/state/nodes/v1/foo/baz":    "qux1",
			"cilium/state/services/v1/foo/bar": "qux2",
			"cilium/state/services/v1/foo/baz": "qux3",
			"cilium/state/services/v1/foo/qux": "qux4",
			"cilium/state/identities/v1/bar":   "qux5",
			"cilium/state/ip/v1/default/fred":  "qux6",
			"cilium/state/ip/v1/default/bar":   "qux7",
			"cilium/state/ip/v1/default/baz":   "qux8",
			"cilium/state/ip/v1/default/qux":   "qux9",
		},
	}
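	// The key-value pairs above encode 2 nodes, 3 services, 1 identity and
	// 4 IPs, matching the counters asserted below once the status turns ready.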
	st := store.NewFactory(store.MetricsProvider())
	km := KVStoreMesh{backend: kvstore.Client(), storeFactory: st, logger: logrus.New()}

	rc := km.newRemoteCluster("foo", func() *models.RemoteCluster {
		return &models.RemoteCluster{Ready: true}
	})
	cfg := types.CiliumClusterConfig{
		ID: 10, Capabilities: types.CiliumClusterConfigCapabilities{SyncedCanaries: true},
	}
	ready := make(chan error)

	// Validate the status before watching the remote cluster.
	status := rc.(*remoteCluster).Status()
	require.False(t, status.Ready, "Status should not be ready")

	require.False(t, status.Synced.Nodes, "Nodes should not be synced")
	require.False(t, status.Synced.Services, "Services should not be synced")
	require.False(t, status.Synced.Identities, "Identities should not be synced")
	require.False(t, status.Synced.Endpoints, "Endpoints should not be synced")

	require.EqualValues(t, 0, status.NumNodes, "Incorrect number of nodes")
	require.EqualValues(t, 0, status.NumSharedServices, "Incorrect number of services")
	require.EqualValues(t, 0, status.NumIdentities, "Incorrect number of identities")
	require.EqualValues(t, 0, status.NumEndpoints, "Incorrect number of endpoints")

	wg.Add(1)
	go func() {
		rc.Run(ctx, remoteClient, cfg, ready)
		rc.Stop()
		wg.Done()
	}()

	require.NoError(t, <-ready, "rc.Run() failed")

	require.EventuallyWithT(t, func(c *assert.CollectT) {
		status := rc.(*remoteCluster).Status()
		assert.True(c, status.Ready, "Status should be ready")

		assert.True(c, status.Synced.Nodes, "Nodes should be synced")
		assert.True(c, status.Synced.Services, "Services should be synced")
		assert.True(c, status.Synced.Identities, "Identities should be synced")
		assert.True(c, status.Synced.Endpoints, "Endpoints should be synced")

		assert.EqualValues(c, 2, status.NumNodes, "Incorrect number of nodes")
		assert.EqualValues(c, 3, status.NumSharedServices, "Incorrect number of services")
		assert.EqualValues(c, 1, status.NumIdentities, "Incorrect number of identities")
		assert.EqualValues(c, 4, status.NumEndpoints, "Incorrect number of endpoints")
	}, timeout, tick, "Reported status is not correct")
}

// mockClusterMesh is a mock implementation of the common.ClusterMesh interface,
// allowing for direct manipulation of the clusters.
type mockClusterMesh struct {
	clusters map[string]*remoteCluster
}

// ForEachRemoteCluster is a mirrored implementation of ClusterMesh.ForEachRemoteCluster that operates on the mocked clusters.
func (m *mockClusterMesh) ForEachRemoteCluster(fn func(common.RemoteCluster) error) error {
	for _, cluster := range m.clusters {
		if err := fn(cluster); err != nil {
			return err
		}
	}
	return nil
}

func (m *mockClusterMesh) NumReadyClusters() int {
	return len(m.clusters)
}

func (m *mockClusterMesh) Start(cell.HookContext) error {
	return nil
}

func (m *mockClusterMesh) Stop(cell.HookContext) error {
	return nil
}

func TestRemoteClusterSync(t *testing.T) {
	tests := []struct {
		name    string
		config  Config
		connect bool
		sync    bool
	}{
		{
			name:    "remote cluster successfully syncs",
			config:  DefaultConfig,
			connect: true,
			sync:    true,
		},
		{
			name: "remote cluster fails to connect",
			// Use very low timeouts to speed up the test, since we expect failures
			config:  Config{PerClusterReadyTimeout: 1 * time.Millisecond, GlobalReadyTimeout: 1 * time.Millisecond},
			connect: false,
			sync:    false,
		},
		{
			name: "remote cluster connects but fails to sync",
			// Use a low timeout only for the global sync, to avoid racing the connected signal
			config:  Config{PerClusterReadyTimeout: 5 * time.Second, GlobalReadyTimeout: 1 * time.Millisecond},
			connect: true,
			sync:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), timeout)
			defer cancel()

			mockClusterMesh := &mockClusterMesh{
				clusters: make(map[string]*remoteCluster),
			}
			km := KVStoreMesh{
				config: tt.config,
				common: mockClusterMesh,
				logger: logrus.New(),
			}

			rc := &remoteCluster{
				name:         "foo",
				synced:       newSynced(),
				readyTimeout: tt.config.PerClusterReadyTimeout,
				logger:       km.logger.WithField(logfields.ClusterName, "foo"),
			}
			rc.synced.resources.Add()
			rc.synced.resources.Stop()

			mockClusterMesh.clusters[rc.name] = rc

			if tt.connect {
				close(rc.synced.connected)
			}

			// Trigger the readiness timeout
			rc.waitForConnection(ctx)

			clusterSyncComplete := func() bool {
				select {
				case <-rc.synced.resources.WaitChannel():
					return true
				default:
					return false
				}
			}

			if tt.connect {
				require.False(t, clusterSyncComplete(), "Cluster sync should not be complete until all resources are done")
				rc.synced.resources.Done()
			}

			require.NoError(t, rc.synced.Resources(ctx), "Still waiting for remote cluster resources")

			ss := syncstate.SyncState{StoppableWaitGroup: lock.NewStoppableWaitGroup()}
			require.False(t, ss.Complete())

			markCompleted := ss.WaitForResource()
			syncedCallback := func(ctx context.Context) {
				markCompleted(ctx)
				ss.Stop()
			}
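
			// markCompleted flags the single registered resource as synced, and
			// ss.Stop() prevents further registrations, so that ss.Complete()
			// can report true once the callback has run (asserted at the end).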

			if !tt.sync {
				// Reset the cluster's synced object so we can simulate a resource never syncing
				rc.synced = newSynced()
				rc.synced.resources.Add()
				rc.synced.resources.Stop()
				require.ErrorIs(t, km.synced(ctx, syncedCallback), context.DeadlineExceeded, "Expected timeout waiting for sync")
			} else {
				require.NoError(t, km.synced(ctx, syncedCallback), "Sync should have completed")
			}

			require.True(t, ss.Complete(), "Global sync not completed")
		})
	}
}