github.com/cilium/cilium@v1.16.2/pkg/clustermesh/operator/clustermesh.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package operator
     5  
     6  import (
     7  	"cmp"
     8  	"context"
     9  	"fmt"
    10  	"slices"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"github.com/cilium/hive/cell"
    15  	"github.com/sirupsen/logrus"
    16  
    17  	"github.com/cilium/cilium/api/v1/models"
    18  	"github.com/cilium/cilium/pkg/clustermesh/common"
    19  	"github.com/cilium/cilium/pkg/clustermesh/wait"
    20  	"github.com/cilium/cilium/pkg/kvstore/store"
    21  	"github.com/cilium/cilium/pkg/logging/logfields"
    22  	serviceStore "github.com/cilium/cilium/pkg/service/store"
    23  )
    24  
    25  // clusterMesh is a cache of multiple remote clusters
    26  type clusterMesh struct {
    27  	// common implements the common logic to connect to remote clusters.
    28  	common common.ClusterMesh
    29  
    30  	logger  logrus.FieldLogger
    31  	Metrics Metrics
    32  
    33  	// globalServices is a list of all global services. The datastructure
    34  	// is protected by its own mutex inside the structure.
    35  	globalServices *common.GlobalServiceCache
    36  
    37  	storeFactory store.Factory
    38  
    39  	started                   atomic.Bool
    40  	clusterAddHooks           []func(string)
    41  	clusterDeleteHooks        []func(string)
    42  	clusterServiceUpdateHooks []func(*serviceStore.ClusterService)
    43  	clusterServiceDeleteHooks []func(*serviceStore.ClusterService)
    44  
    45  	syncTimeoutConfig  wait.TimeoutConfig
    46  	syncTimeoutLogOnce sync.Once
    47  }
    48  
    49  // ClusterMesh is the interface corresponding to the clusterMesh struct to expose
    50  // its public methods to other Cilium packages.
    51  type ClusterMesh interface {
    52  	// RegisterClusterAddHook register a hook when a cluster is added to the mesh.
    53  	// This should NOT be called after the Start hook.
    54  	RegisterClusterAddHook(clusterAddHook func(string))
    55  	// RegisterClusterDeleteHook register a hook when a cluster is removed from the mesh.
    56  	// This should NOT be called after the Start hook.
    57  	RegisterClusterDeleteHook(clusterDeleteHook func(string))
    58  	// RegisterClusterServiceUpdateHook register a hook when a service in the mesh is updated.
    59  	// This should NOT be called after the Start hook.
    60  	RegisterClusterServiceUpdateHook(clusterServiceUpdateHook func(*serviceStore.ClusterService))
    61  	// RegisterClusterServiceDeleteHook register a hook when a service in the mesh is deleted.
    62  	// This should NOT be called after the Start hook.
    63  	RegisterClusterServiceDeleteHook(clusterServiceDeleteHook func(*serviceStore.ClusterService))
    64  
    65  	ServicesSynced(ctx context.Context) error
    66  	GlobalServices() *common.GlobalServiceCache
    67  }
    68  
    69  func newClusterMesh(lc cell.Lifecycle, params clusterMeshParams) (*clusterMesh, ClusterMesh) {
    70  	if params.ClusterMeshConfig == "" || !params.Cfg.ClusterMeshEnableEndpointSync {
    71  		return nil, nil
    72  	}
    73  
    74  	params.Logger.Info("Operator ClusterMesh component enabled")
    75  
    76  	cm := clusterMesh{
    77  		logger: params.Logger,
    78  		globalServices: common.NewGlobalServiceCache(
    79  			params.Metrics.TotalGlobalServices.WithLabelValues(params.ClusterInfo.Name),
    80  		),
    81  		storeFactory:      params.StoreFactory,
    82  		syncTimeoutConfig: params.TimeoutConfig,
    83  	}
    84  	cm.common = common.NewClusterMesh(common.Configuration{
    85  		Config:           params.Config,
    86  		ClusterInfo:      params.ClusterInfo,
    87  		NewRemoteCluster: cm.newRemoteCluster,
    88  		ServiceResolver:  params.ServiceResolver,
    89  		Metrics:          params.CommonMetrics,
    90  	})
    91  
    92  	lc.Append(cm.common)
    93  	lc.Append(&cm)
    94  	return &cm, &cm
    95  }
    96  
    97  // RegisterClusterAddHook register a hook when a cluster is added to the mesh.
    98  // This should NOT be called after the Start hook.
    99  func (cm *clusterMesh) RegisterClusterAddHook(clusterAddHook func(string)) {
   100  	if cm.started.Load() {
   101  		panic(fmt.Errorf("can't call RegisterClusterAddHook after the Start hook"))
   102  	}
   103  	cm.clusterAddHooks = append(cm.clusterAddHooks, clusterAddHook)
   104  }
   105  
   106  // RegisterClusterDeleteHook register a hook when a cluster is removed from the mesh.
   107  // This should NOT be called after the Start hook.
   108  func (cm *clusterMesh) RegisterClusterDeleteHook(clusterDeleteHook func(string)) {
   109  	if cm.started.Load() {
   110  		panic(fmt.Errorf("can't call RegisterClusterDeleteHook after the Start hook"))
   111  	}
   112  	cm.clusterDeleteHooks = append(cm.clusterDeleteHooks, clusterDeleteHook)
   113  }
   114  
   115  // RegisterClusterServiceUpdateHook register a hook when a service in the mesh is updated.
   116  // This should NOT be called after the Start hook.
   117  func (cm *clusterMesh) RegisterClusterServiceUpdateHook(clusterServiceUpdateHook func(*serviceStore.ClusterService)) {
   118  	if cm.started.Load() {
   119  		panic(fmt.Errorf("can't call RegisterClusterServiceUpdateHook after the Start hook"))
   120  	}
   121  	cm.clusterServiceUpdateHooks = append(cm.clusterServiceUpdateHooks, clusterServiceUpdateHook)
   122  }
   123  
   124  // RegisterClusterServiceDeleteHook register a hook when a service in the mesh is deleted.
   125  // This should NOT be called after the Start hook.
   126  func (cm *clusterMesh) RegisterClusterServiceDeleteHook(clusterServiceDeleteHook func(*serviceStore.ClusterService)) {
   127  	if cm.started.Load() {
   128  		panic(fmt.Errorf("can't call RegisterClusterServiceDeleteHook after the Start hook"))
   129  	}
   130  	cm.clusterServiceDeleteHooks = append(cm.clusterServiceDeleteHooks, clusterServiceDeleteHook)
   131  }
   132  
   133  func (cm *clusterMesh) GlobalServices() *common.GlobalServiceCache {
   134  	return cm.globalServices
   135  }
   136  
   137  func (cm *clusterMesh) newRemoteCluster(name string, status common.StatusFunc) common.RemoteCluster {
   138  	rc := &remoteCluster{
   139  		name:               name,
   140  		globalServices:     cm.globalServices,
   141  		storeFactory:       cm.storeFactory,
   142  		synced:             newSynced(),
   143  		status:             status,
   144  		clusterAddHooks:    cm.clusterAddHooks,
   145  		clusterDeleteHooks: cm.clusterDeleteHooks,
   146  	}
   147  
   148  	rc.remoteServices = cm.storeFactory.NewWatchStore(
   149  		name,
   150  		serviceStore.KeyCreator(
   151  			serviceStore.ClusterNameValidator(name),
   152  			serviceStore.NamespacedNameValidator(),
   153  		),
   154  		common.NewSharedServicesObserver(
   155  			cm.logger.WithField(logfields.ClusterName, name),
   156  			cm.globalServices,
   157  			func(svc *serviceStore.ClusterService) {
   158  				for _, hook := range cm.clusterServiceUpdateHooks {
   159  					hook(svc)
   160  				}
   161  			},
   162  			func(svc *serviceStore.ClusterService) {
   163  				for _, hook := range cm.clusterServiceDeleteHooks {
   164  					hook(svc)
   165  				}
   166  			},
   167  		),
   168  		store.RWSWithOnSyncCallback(func(ctx context.Context) { rc.synced.services.Stop() }),
   169  	)
   170  
   171  	return rc
   172  }
   173  
   174  func (cm *clusterMesh) Start(cell.HookContext) error {
   175  	cm.started.Store(true)
   176  	return nil
   177  }
   178  
   179  func (cm *clusterMesh) Stop(cell.HookContext) error {
   180  	return nil
   181  }
   182  
   183  // ServicesSynced returns after that either the initial list of shared services has
   184  // been received from all remote clusters, or the maximum wait period controlled by the
   185  // clustermesh-sync-timeout flag elapsed. It returns an error if the given context expired.
   186  func (cm *clusterMesh) ServicesSynced(ctx context.Context) error {
   187  	return cm.synced(ctx, func(rc *remoteCluster) wait.Fn { return rc.synced.Services })
   188  }
   189  
   190  func (cm *clusterMesh) synced(ctx context.Context, toWaitFn func(*remoteCluster) wait.Fn) error {
   191  	wctx, cancel := context.WithTimeout(ctx, cm.syncTimeoutConfig.Timeout())
   192  	defer cancel()
   193  
   194  	waiters := make([]wait.Fn, 0)
   195  	cm.common.ForEachRemoteCluster(func(rci common.RemoteCluster) error {
   196  		rc := rci.(*remoteCluster)
   197  		waiters = append(waiters, toWaitFn(rc))
   198  		return nil
   199  	})
   200  
   201  	err := wait.ForAll(wctx, waiters)
   202  	if ctx.Err() == nil && wctx.Err() != nil {
   203  		// The sync timeout expired, but the parent context is still valid, which
   204  		// means that the circuit breaker was triggered. Print a warning message
   205  		// and continue normally, as if the synchronization completed successfully.
   206  		// This ensures that we don't block forever in case of misconfigurations.
   207  		cm.syncTimeoutLogOnce.Do(func() {
   208  			cm.logger.Warning("Failed waiting for clustermesh synchronization, expect possible disruption of cross-cluster connections")
   209  		})
   210  
   211  		return nil
   212  	}
   213  
   214  	return err
   215  }
   216  
   217  // Status returns the status of the ClusterMesh subsystem
   218  func (cm *clusterMesh) status() []*models.RemoteCluster {
   219  	var clusters []*models.RemoteCluster
   220  
   221  	cm.common.ForEachRemoteCluster(func(rci common.RemoteCluster) error {
   222  		rc := rci.(*remoteCluster)
   223  		clusters = append(clusters, rc.Status())
   224  		return nil
   225  	})
   226  
   227  	// Sort the remote clusters information to ensure consistent ordering.
   228  	slices.SortFunc(clusters,
   229  		func(a, b *models.RemoteCluster) int { return cmp.Compare(a.Name, b.Name) })
   230  
   231  	return clusters
   232  }