github.com/cilium/cilium@v1.16.2/pkg/clustermesh/kvstoremesh/kvstoremesh_test.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package kvstoremesh

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/cilium/hive/cell"
	"github.com/cilium/hive/hivetest"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	baseclocktest "k8s.io/utils/clock/testing"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/clustermesh-apiserver/syncstate"
	"github.com/cilium/cilium/pkg/clustermesh/common"
	"github.com/cilium/cilium/pkg/clustermesh/types"
	"github.com/cilium/cilium/pkg/clustermesh/utils"
	"github.com/cilium/cilium/pkg/hive"
	"github.com/cilium/cilium/pkg/inctimer"
	"github.com/cilium/cilium/pkg/kvstore"
	"github.com/cilium/cilium/pkg/kvstore/store"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/promise"
	"github.com/cilium/cilium/pkg/testutils"
)

// Configure a generous timeout to prevent flakes when running in a noisy CI environment.
var (
	tick    = 10 * time.Millisecond
	timeout = 5 * time.Second
)

// remoteEtcdClientWrapper wraps a kvstore backend to mimic the etcd client of a remote cluster.
type remoteEtcdClientWrapper struct {
	kvstore.BackendOperations
	name   string
	cached bool

	kvs map[string]string
	mu  lock.Mutex

	syncedCanariesWatched bool
}

// Override the ListAndWatch method so that we can propagate whatever event we want without key conflicts with
// those eventually created by kvstoremesh. Additionally, this also allows tracking which prefixes have been watched.
func (w *remoteEtcdClientWrapper) ListAndWatch(ctx context.Context, prefix string, chanSize int) *kvstore.Watcher {
	events := make(kvstore.EventChan, 10)

	w.mu.Lock()
	defer w.mu.Unlock()

	if prefix == fmt.Sprintf("cilium/synced/%s/", w.name) {
		state := "state"
		if w.cached {
			state = "cache"
		}

		w.syncedCanariesWatched = true
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/nodes/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/services/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/identities/v1", w.name, state)}
		events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: fmt.Sprintf("cilium/synced/%s/cilium/%s/ip/v1", w.name, state)}
	} else {
		// Emit a creation event for every stored key matching the prefix, followed
		// by a single ListDone event if at least one key matched.
		var found bool
		for key, value := range w.kvs {
			if strings.HasPrefix(key, prefix) {
				events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeCreate, Key: key, Value: []byte(value)}
				found = true
				delete(w.kvs, key)
			}
		}

		if found {
			events <- kvstore.KeyValueEvent{Typ: kvstore.EventTypeListDone}
		}
	}

	go func() {
		<-ctx.Done()
		close(events)
	}()

	return &kvstore.Watcher{Events: events}
}

// clockAdvance steps the fake clock by the given duration, waiting (up to the
// test timeout) until at least one waiter has been registered on the clock.
func clockAdvance(t assert.TestingT, fc *baseclocktest.FakeClock, d time.Duration) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	timer, stop := inctimer.New()
	defer stop()

	for !fc.HasWaiters() {
		select {
		case <-ctx.Done():
			assert.FailNow(t, "Could not advance clock within expected timeout")
		case <-timer.After(1 * time.Millisecond):
		}
	}

	fc.Step(d)
}

func TestRemoteClusterRun(t *testing.T) {
	testutils.IntegrationTest(t)

	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	tests := []struct {
		name   string
		srccfg types.CiliumClusterConfig
		dstcfg types.CiliumClusterConfig
		kvs    map[string]string
	}{
		{
			name:   "remote cluster has empty cluster config",
			srccfg: types.CiliumClusterConfig{},
			dstcfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/state/nodes/v1/foo/bar":    "qux1",
				"cilium/state/services/v1/foo/bar": "qux2",
				"cilium/state/identities/v1/bar":   "qux3",
				"cilium/state/ip/v1/default/bar":   "qux4",
			},
		},
		{
			name: "remote cluster supports the synced canaries",
			srccfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/state/nodes/v1/foo/bar":    "qux1",
				"cilium/state/services/v1/foo/bar": "qux2",
				"cilium/state/identities/v1/bar":   "qux3",
				"cilium/state/ip/v1/default/bar":   "qux4",
			},
		},
		{
			name: "remote cluster supports the cached prefixes",
			srccfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					Cached: true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			},
		},
		{
			name: "remote cluster supports both synced canaries and cached prefixes",
			srccfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			dstcfg: types.CiliumClusterConfig{
				ID: 10,
				Capabilities: types.CiliumClusterConfigCapabilities{
					SyncedCanaries: true,
					Cached:         true,
				},
			},
			kvs: map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var wg sync.WaitGroup
			ctx, cancel := context.WithCancel(context.Background())

			t.Cleanup(func() {
				cancel()
				wg.Wait()

				require.NoError(t, kvstore.Client().DeletePrefix(context.Background(), kvstore.BaseKeyPrefix))
			})

			remoteClient := &remoteEtcdClientWrapper{
				BackendOperations: kvstore.Client(),
				name:              "foo",
				cached:            tt.srccfg.Capabilities.Cached,
				kvs:               tt.kvs,
			}

			st := store.NewFactory(store.MetricsProvider())
			fakeclock := baseclocktest.NewFakeClock(time.Now())
			km := KVStoreMesh{backend: kvstore.Client(), storeFactory: st, logger: logrus.New(), clock: fakeclock}

			rc := km.newRemoteCluster("foo", nil)
			ready := make(chan error)
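			// Run the remote cluster connection in the background; it stops once the test context gets cancelled.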
			wg.Add(1)
			go func() {
				rc.Run(ctx, remoteClient, tt.srccfg, ready)
				rc.Stop()
				wg.Done()
			}()

			require.NoError(t, <-ready, "rc.Run() failed")

			// Assert that the cluster config got properly propagated
			require.EventuallyWithT(t, func(c *assert.CollectT) {
				cfg, err := utils.GetClusterConfig(ctx, "foo", kvstore.Client())
				assert.NoError(c, err)
				assert.Equal(c, tt.dstcfg, cfg)
			}, timeout, tick, "Failed to retrieve the cluster config")

			// Assert that the keys have been properly reflected
			for key, value := range map[string]string{
				"cilium/cache/nodes/v1/foo/bar":      "qux1",
				"cilium/cache/services/v1/foo/bar":   "qux2",
				"cilium/cache/identities/v1/foo/bar": "qux3",
				"cilium/cache/ip/v1/foo/bar":         "qux4",
			} {
				require.EventuallyWithTf(t, func(c *assert.CollectT) {
					v, err := kvstore.Client().Get(ctx, key)
					assert.NoError(c, err)
					assert.Equal(c, value, string(v))
				}, timeout, tick, "Expected key %q does not seem to have the correct value %q", key, value)
			}

			// Assert that the sync canaries have been properly set
			for _, key := range []string{
				"cilium/synced/foo/cilium/cache/nodes/v1",
				"cilium/synced/foo/cilium/cache/services/v1",
				"cilium/synced/foo/cilium/cache/identities/v1",
				"cilium/synced/foo/cilium/cache/ip/v1",
			} {
				require.EventuallyWithTf(t, func(c *assert.CollectT) {
					v, err := kvstore.Client().Get(ctx, key)
					assert.NoError(c, err)
					assert.NotEmpty(c, string(v))
				}, timeout, tick, "Expected sync canary %q is not correctly present", key)
			}

			// Assert that synced canaries have been watched if expected
			require.Equal(t, tt.srccfg.Capabilities.SyncedCanaries, remoteClient.syncedCanariesWatched)

			cancel()
			wg.Wait()

			// rc.Remove waits for a three-minute grace period before proceeding
			// with the deletion. Let's handle that by advancing the fake time.
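			// clockAdvance is started in a goroutine because rc.Remove below blocks
			// until the fake clock has been stepped past the grace period.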
			go clockAdvance(t, fakeclock, 3*time.Minute)

			// Assert that Remove() removes all keys previously created
			rc.Remove(context.Background())

			pairs, err := kvstore.Client().ListPrefix(context.Background(), kvstore.BaseKeyPrefix)
			require.NoError(t, err, "Failed to retrieve kvstore keys")
			require.Empty(t, pairs, "Cached keys not correctly removed")
		})
	}
}

// localClientWrapper wraps the local kvstore backend, allowing to inject a configurable
// number of failures for specific Delete and DeletePrefix calls.
type localClientWrapper struct {
	kvstore.BackendOperations
	errors map[string]uint
}

func (lcw *localClientWrapper) Delete(ctx context.Context, key string) error {
	if cnt := lcw.errors[key]; cnt > 0 {
		lcw.errors[key] = cnt - 1
		return errors.New("fake error")
	}

	return lcw.BackendOperations.Delete(ctx, key)
}

func (lcw *localClientWrapper) DeletePrefix(ctx context.Context, path string) error {
	if cnt := lcw.errors[path]; cnt > 0 {
		lcw.errors[path] = cnt - 1
		return errors.New("fake error")
	}

	return lcw.BackendOperations.DeletePrefix(ctx, path)
}

func TestRemoteClusterRemove(t *testing.T) {
	testutils.IntegrationTest(t)

	ctx := context.Background()
	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	keys := func(name string) []string {
		return []string{
			fmt.Sprintf("cilium/cluster-config/%s", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/nodes/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/services/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/identities/v1", name),
			fmt.Sprintf("cilium/synced/%s/cilium/cache/ip/v1", name),
			fmt.Sprintf("cilium/cache/nodes/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/services/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/identities/v1/%s/bar", name),
			fmt.Sprintf("cilium/cache/ip/v1/%s/bar", name),
		}
	}

	wrapper := &localClientWrapper{
		BackendOperations: kvstore.Client(),
		errors: map[string]uint{
			"cilium/cache/identities/v1/foobar/": 1,
			"cilium/cluster-config/baz":          10,
		},
	}

	st := store.NewFactory(store.MetricsProvider())
	fakeclock := baseclocktest.NewFakeClock(time.Now())
	km := KVStoreMesh{backend: wrapper, storeFactory: st, logger: logrus.New(), clock: fakeclock}
	rcs := make(map[string]*remoteCluster)
	for _, cluster := range []string{"foo", "foobar", "baz"} {
		rcs[cluster] = km.newRemoteCluster(cluster, nil).(*remoteCluster)
		rcs[cluster].Stop()
	}

	for _, rc := range rcs {
		for _, key := range keys(rc.name) {
			require.NoError(t, kvstore.Client().Update(ctx, key, []byte("value"), false))
		}
	}

	var wg sync.WaitGroup
	bgrun := func(ctx context.Context, fn func(context.Context)) {
		wg.Add(1)
		go func() {
			fn(ctx)
			wg.Done()
		}()
	}

	assertDeleted := func(t assert.TestingT, ctx context.Context, key string) {
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(t, err, "Failed to retrieve kvstore key %s", key)
		assert.Empty(t, string(value), "Key %s has not been deleted", key)
	}

	assertNotDeleted := func(t assert.TestingT, ctx context.Context, key string) {
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(t, err, "Failed to retrieve kvstore key %s", key)
		assert.NotEmpty(t, string(value), "Key %s has been incorrectly deleted", key)
	}

	// Remove should only delete the cluster config key before grace period expiration
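	// The cached keys and sync canaries are removed only once the fake clock is advanced past it.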
	bgrun(ctx, rcs["foo"].Remove)
	assert.EventuallyWithT(t, func(c *assert.CollectT) {
		assertDeleted(c, ctx, keys("foo")[0])
		for _, key := range keys("foo")[1:] {
			assertNotDeleted(c, ctx, key)
		}
	}, timeout, tick)

	clockAdvance(t, fakeclock, 3*time.Minute-1*time.Millisecond)

	// Grace period should still not have expired
	time.Sleep(tick)
	for _, key := range keys("foo")[1:] {
		assertNotDeleted(t, ctx, key)
	}

	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	// Grace period expired, all keys should now have been deleted
	for _, key := range keys("foo") {
		assertDeleted(t, ctx, key)
	}

	// Keys of other clusters should not have been touched
	for _, cluster := range []string{"foobar", "baz"} {
		for _, key := range keys(cluster) {
			assertNotDeleted(t, ctx, key)
		}
	}

	// Simulate the failure of one of the delete calls
	bgrun(ctx, rcs["foobar"].Remove)

	clockAdvance(t, fakeclock, 3*time.Minute)
	assert.EventuallyWithT(t, func(c *assert.CollectT) {
		// Only the keys up to the erroring one should have been deleted
		for _, key := range keys("foobar")[0:7] {
			assertDeleted(c, ctx, key)
		}
		for _, key := range keys("foobar")[7:] {
			assertNotDeleted(c, ctx, key)
		}
	}, timeout, tick)

	clockAdvance(t, fakeclock, 2*time.Second-1*time.Millisecond)
	time.Sleep(tick)
	for _, key := range keys("foobar")[7:] {
		// Backoff should not have expired yet
		assertNotDeleted(t, ctx, key)
	}

	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	for _, key := range keys("foobar") {
		// Backoff expired, all keys should have been deleted
		assertDeleted(t, ctx, key)
	}

	// Simulate the persistent failure of one of the delete calls
	bgrun(ctx, rcs["baz"].Remove)

	clockAdvance(t, fakeclock, 2*time.Second)  // First retry
	clockAdvance(t, fakeclock, 4*time.Second)  // Second retry
	clockAdvance(t, fakeclock, 8*time.Second)  // Third retry
	clockAdvance(t, fakeclock, 16*time.Second) // Fourth retry

	// Fifth and last retry
	clockAdvance(t, fakeclock, 32*time.Second-1*time.Millisecond)

	// Make sure that Remove() is still actually waiting. If it weren't,
	// clockAdvance couldn't complete successfully.
	clockAdvance(t, fakeclock, 1*time.Millisecond)
	wg.Wait()

	for _, key := range keys("baz") {
		// None of the keys should have been deleted, due to the persistent error
		assertNotDeleted(t, ctx, key)
	}

	// The context expired during the grace period
	cctx, cancel := context.WithCancel(context.Background())
	bgrun(cctx, rcs["foo"].Remove)
	clockAdvance(t, fakeclock, 1*time.Minute)
	cancel()
	wg.Wait()

	// Remove the existing waiter that we didn't clean up due to context termination.
	if fakeclock.HasWaiters() {
		fakeclock.Step(5 * time.Minute)
	}

	// The context expired during backoff
	cctx, cancel = context.WithCancel(context.Background())
	bgrun(cctx, rcs["baz"].Remove)
	clockAdvance(t, fakeclock, 1*time.Minute)
	cancel()
	wg.Wait()

	// Remove the existing waiter that we didn't clean up due to context termination.
	if fakeclock.HasWaiters() {
		fakeclock.Step(5 * time.Minute)
	}
}

func TestRemoteClusterRemoveShutdown(t *testing.T) {
	// Test that the KVStoreMesh shutdown process is not blocked by possible
	// in-progress remote cluster removals.
	testutils.IntegrationTest(t)

	ctx := context.Background()
	kvstore.SetupDummyWithConfigOpts(t, "etcd",
		// Explicitly set higher QPS than the default to speed up the test
		map[string]string{kvstore.EtcdRateLimitOption: "100"},
	)

	dir := t.TempDir()
	cfg := []byte(fmt.Sprintf("endpoints:\n- %s\n", kvstore.EtcdDummyAddress()))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "remote"), cfg, 0644))

	// Let's manually create a fake cluster configuration for the remote cluster,
	// because we are using the same kvstore. This will be used as a synchronization
	// point to stop the hive while blocked waiting for the grace period.
	require.NoError(t, utils.SetClusterConfig(ctx, "remote", types.CiliumClusterConfig{ID: 20}, kvstore.Client()))

	var km *KVStoreMesh
	h := hive.New(
		Cell,

		cell.Provide(
			func() types.ClusterInfo { return types.ClusterInfo{ID: 10, Name: "local"} },
			func() Config { return Config{} },
			func() promise.Promise[kvstore.BackendOperations] {
				clr, clp := promise.New[kvstore.BackendOperations]()
				clr.Resolve(kvstore.Client())
				return clp
			},
		),

		cell.Invoke(func(km_ *KVStoreMesh) { km = km_ }),
	)
	hive.AddConfigOverride(h, func(cfg *common.Config) { cfg.ClusterMeshConfig = dir })

	tlog := hivetest.Logger(t)
	require.NoError(t, h.Start(tlog, ctx), "Failed to start the hive")

	// Wait until the connection has been successfully established, before disconnecting.
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		status := km.status()
		if assert.Len(c, status, 1) {
			assert.True(c, status[0].Ready)
		}
	}, timeout, tick, "Failed to connect to the remote cluster")

	require.NoError(t, os.Remove(filepath.Join(dir, "remote")))

	// Wait until the cluster config key has been removed, to ensure that we are
	// actually waiting for the grace period expiration.
	require.EventuallyWithT(t, func(c *assert.CollectT) {
		key := path.Join(kvstore.ClusterConfigPrefix, "remote")
		value, err := kvstore.Client().Get(ctx, key)
		assert.NoError(c, err, "Failed to retrieve kvstore key %s", key)
		assert.Empty(c, string(value), "Key %s has not been deleted", key)
	}, timeout, tick)

	sctx, cancel := context.WithTimeout(ctx, 1*time.Second)
	defer cancel()
	require.NoError(t, h.Stop(tlog, sctx), "Failed to stop the hive")
}

func TestRemoteClusterStatus(t *testing.T) {
	testutils.IntegrationTest(t)

	kvstore.SetupDummy(t, "etcd")

	var wg sync.WaitGroup
	ctx, cancel := context.WithCancel(context.Background())

	t.Cleanup(func() {
		cancel()
		wg.Wait()

		require.NoError(t, kvstore.Client().DeletePrefix(context.Background(), kvstore.BaseKeyPrefix))
	})

	remoteClient := &remoteEtcdClientWrapper{
		BackendOperations: kvstore.Client(),
		name:              "foo",
		kvs: map[string]string{
			"cilium/state/nodes/v1/foo/bar":    "qux0",
			"cilium/state/nodes/v1/foo/baz":    "qux1",
			"cilium/state/services/v1/foo/bar": "qux2",
			"cilium/state/services/v1/foo/baz": "qux3",
			"cilium/state/services/v1/foo/qux": "qux4",
			"cilium/state/identities/v1/bar":   "qux5",
			"cilium/state/ip/v1/default/fred":  "qux6",
			"cilium/state/ip/v1/default/bar":   "qux7",
			"cilium/state/ip/v1/default/baz":   "qux8",
			"cilium/state/ip/v1/default/qux":   "qux9",
		},
	}
	st := store.NewFactory(store.MetricsProvider())
	km := KVStoreMesh{backend: kvstore.Client(), storeFactory: st, logger: logrus.New()}

	rc := km.newRemoteCluster("foo", func() *models.RemoteCluster {
		return &models.RemoteCluster{Ready: true}
	})
	cfg := types.CiliumClusterConfig{
		ID: 10, Capabilities: types.CiliumClusterConfigCapabilities{SyncedCanaries: true},
	}
	ready := make(chan error)

	// Validate the status before watching the remote cluster.
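	// Nothing has been synchronized yet, so all flags and counters are expected to be zero.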
	status := rc.(*remoteCluster).Status()
	require.False(t, status.Ready, "Status should not be ready")

	require.False(t, status.Synced.Nodes, "Nodes should not be synced")
	require.False(t, status.Synced.Services, "Services should not be synced")
	require.False(t, status.Synced.Identities, "Identities should not be synced")
	require.False(t, status.Synced.Endpoints, "Endpoints should not be synced")

	require.EqualValues(t, 0, status.NumNodes, "Incorrect number of nodes")
	require.EqualValues(t, 0, status.NumSharedServices, "Incorrect number of services")
	require.EqualValues(t, 0, status.NumIdentities, "Incorrect number of identities")
	require.EqualValues(t, 0, status.NumEndpoints, "Incorrect number of endpoints")

	wg.Add(1)
	go func() {
		rc.Run(ctx, remoteClient, cfg, ready)
		rc.Stop()
		wg.Done()
	}()

	require.NoError(t, <-ready, "rc.Run() failed")

	require.EventuallyWithT(t, func(c *assert.CollectT) {
		status := rc.(*remoteCluster).Status()
		assert.True(c, status.Ready, "Status should be ready")

		assert.True(c, status.Synced.Nodes, "Nodes should be synced")
		assert.True(c, status.Synced.Services, "Services should be synced")
		assert.True(c, status.Synced.Identities, "Identities should be synced")
		assert.True(c, status.Synced.Endpoints, "Endpoints should be synced")

		assert.EqualValues(c, 2, status.NumNodes, "Incorrect number of nodes")
		assert.EqualValues(c, 3, status.NumSharedServices, "Incorrect number of services")
		assert.EqualValues(c, 1, status.NumIdentities, "Incorrect number of identities")
		assert.EqualValues(c, 4, status.NumEndpoints, "Incorrect number of endpoints")
	}, timeout, tick, "Reported status is not correct")
}

// mockClusterMesh is a mock implementation of the common.ClusterMesh interface
// allowing for direct manipulation of the clusters
type mockClusterMesh struct {
	clusters map[string]*remoteCluster
}

// ForEachRemoteCluster is a mirrored implementation of ClusterMesh.ForEachRemoteCluster that operates on the mocked clusters.
func (m *mockClusterMesh) ForEachRemoteCluster(fn func(common.RemoteCluster) error) error {
	for _, cluster := range m.clusters {
		if err := fn(cluster); err != nil {
			return err
		}
	}
	return nil
}

func (m *mockClusterMesh) NumReadyClusters() int {
	return len(m.clusters)
}

func (m *mockClusterMesh) Start(cell.HookContext) error {
	return nil
}

func (m *mockClusterMesh) Stop(cell.HookContext) error {
	return nil
}

func TestRemoteClusterSync(t *testing.T) {
	tests := []struct {
		name    string
		config  Config
		connect bool
		sync    bool
	}{
		{
			name:    "remote cluster successfully syncs",
			config:  DefaultConfig,
			connect: true,
			sync:    true,
		},
		{
			name: "remote cluster fails to connect",
			// use very low timeouts to speed up the test since we expect failures
			config:  Config{PerClusterReadyTimeout: 1 * time.Millisecond, GlobalReadyTimeout: 1 * time.Millisecond},
			connect: false,
			sync:    false,
		},
		{
			name: "remote cluster connects but fails to sync",
			// use a low timeout only for global sync to avoid racing the connected signal
			config:  Config{PerClusterReadyTimeout: 5 * time.Second, GlobalReadyTimeout: 1 * time.Millisecond},
			connect: true,
			sync:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), timeout)
			defer cancel()

			mockClusterMesh := &mockClusterMesh{
				clusters: make(map[string]*remoteCluster),
			}
			km := KVStoreMesh{
				config: tt.config,
				common: mockClusterMesh,
				logger: logrus.New(),
			}

			rc := &remoteCluster{
				name:         "foo",
				synced:       newSynced(),
				readyTimeout: tt.config.PerClusterReadyTimeout,
				logger:       km.logger.WithField(logfields.ClusterName, "foo"),
			}
			rc.synced.resources.Add()
			rc.synced.resources.Stop()

			mockClusterMesh.clusters[rc.name] = rc

			if tt.connect {
				close(rc.synced.connected)
			}

			// trigger the readiness timeout
			rc.waitForConnection(ctx)

			clusterSyncComplete := func() bool {
				select {
				case <-rc.synced.resources.WaitChannel():
					return true
				default:
					return false
				}
			}

			if tt.connect {
				require.False(t, clusterSyncComplete(), "Cluster sync should not be complete until all resources are done")
				rc.synced.resources.Done()
			}

			require.NoError(t, rc.synced.Resources(ctx), "Still waiting for remote cluster resources")

			ss := syncstate.SyncState{StoppableWaitGroup: lock.NewStoppableWaitGroup()}
			require.False(t, ss.Complete())

			markCompleted := ss.WaitForResource()
			syncedCallback := func(ctx context.Context) {
				markCompleted(ctx)
				ss.Stop()
			}

			if !tt.sync {
				// reset the cluster's synced object so we can simulate a resource never syncing
				rc.synced = newSynced()
				rc.synced.resources.Add()
				rc.synced.resources.Stop()
				require.ErrorIs(t, km.synced(ctx, syncedCallback), context.DeadlineExceeded, "Expected timeout waiting for sync")
			} else {
				require.NoError(t, km.synced(ctx, syncedCallback), "Sync should have completed")
			}

			require.True(t, ss.Complete(), "Global sync not completed")
		})
	}
}