github.com/cilium/cilium@v1.16.2/pkg/clustermesh/common/clustermesh.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package common 5 6 import ( 7 "context" 8 "fmt" 9 "sync" 10 11 "github.com/cilium/hive/cell" 12 "github.com/spf13/pflag" 13 14 "github.com/cilium/cilium/api/v1/models" 15 "github.com/cilium/cilium/pkg/clustermesh/types" 16 "github.com/cilium/cilium/pkg/controller" 17 "github.com/cilium/cilium/pkg/dial" 18 "github.com/cilium/cilium/pkg/kvstore" 19 "github.com/cilium/cilium/pkg/lock" 20 "github.com/cilium/cilium/pkg/logging/logfields" 21 ) 22 23 const ( 24 // configNotificationsChannelSize is the size of the channel used to 25 // notify a clustermesh of configuration changes 26 configNotificationsChannelSize = 512 27 ) 28 29 type Config struct { 30 // ClusterMeshConfig is the path to the clustermesh configuration directory. 31 ClusterMeshConfig string 32 } 33 34 func (def Config) Flags(flags *pflag.FlagSet) { 35 flags.String("clustermesh-config", def.ClusterMeshConfig, "Path to the ClusterMesh configuration directory") 36 } 37 38 type StatusFunc func() *models.RemoteCluster 39 type RemoteClusterCreatorFunc func(name string, status StatusFunc) RemoteCluster 40 41 // Configuration is the configuration that must be provided to 42 // NewClusterMesh() 43 type Configuration struct { 44 Config 45 46 // ClusterInfo is the id/name of the local cluster. This is used for logging and metrics 47 ClusterInfo types.ClusterInfo 48 49 // NewRemoteCluster is a function returning a new implementation of the remote cluster business logic. 50 NewRemoteCluster RemoteClusterCreatorFunc 51 52 // nodeName is the name of the local node. This is used for logging and metrics 53 NodeName string 54 55 // ClusterSizeDependantInterval allows to calculate intervals based on cluster size. 56 ClusterSizeDependantInterval kvstore.ClusterSizeDependantIntervalFunc 57 58 // ServiceResolver, if not nil, is used to create a custom dialer for service resolution. 59 ServiceResolver *dial.ServiceResolver 60 61 // Metrics holds the different clustermesh metrics. 62 Metrics Metrics 63 } 64 65 type ClusterMesh interface { 66 cell.HookInterface 67 68 // ForEachRemoteCluster calls the provided function for each remote cluster 69 // in the ClusterMesh. 70 ForEachRemoteCluster(fn func(RemoteCluster) error) error 71 // NumReadyClusters returns the number of remote clusters to which a connection 72 // has been established 73 NumReadyClusters() int 74 } 75 76 // clusterMesh is a cache of multiple remote clusters 77 type clusterMesh struct { 78 // conf is the configuration, it is immutable after NewClusterMesh() 79 conf Configuration 80 81 mutex lock.RWMutex 82 wg sync.WaitGroup 83 84 clusters map[string]*remoteCluster 85 configWatcher *configDirectoryWatcher 86 87 // tombstones tracks the remote cluster configurations that have been removed, 88 // and whose cleanup process is being currently performed. This allows for 89 // asynchronously performing the appropriate tasks, while preventing the 90 // reconnection to the same cluster until the previously cleanup completed. 91 tombstones map[string]string 92 93 // rctx is a context that is used on cluster removal, to allow aborting 94 // the associated process if still running during shutdown (via rcancel). 95 rctx context.Context 96 rcancel context.CancelFunc 97 } 98 99 // NewClusterMesh creates a new remote cluster cache based on the 100 // provided configuration 101 func NewClusterMesh(c Configuration) ClusterMesh { 102 rctx, rcancel := context.WithCancel(context.Background()) 103 return &clusterMesh{ 104 conf: c, 105 clusters: map[string]*remoteCluster{}, 106 tombstones: map[string]string{}, 107 rctx: rctx, 108 rcancel: rcancel, 109 } 110 } 111 112 func (cm *clusterMesh) Start(cell.HookContext) error { 113 w, err := createConfigDirectoryWatcher(cm.conf.ClusterMeshConfig, cm) 114 if err != nil { 115 return fmt.Errorf("unable to create config directory watcher: %w", err) 116 } 117 118 cm.configWatcher = w 119 120 if err := cm.configWatcher.watch(); err != nil { 121 return fmt.Errorf("unable to start config directory watcher: %w", err) 122 } 123 124 return nil 125 } 126 127 // Close stops watching for remote cluster configuration files to appear and 128 // will close all connections to remote clusters 129 func (cm *clusterMesh) Stop(cell.HookContext) error { 130 if cm.configWatcher != nil { 131 cm.configWatcher.close() 132 } 133 134 // Wait until all in-progress removal processes have completed, if any. 135 // We must not hold the mutex at this point, as needed by the go routines. 136 cm.rcancel() 137 cm.wg.Wait() 138 139 cm.mutex.Lock() 140 defer cm.mutex.Unlock() 141 142 for name, cluster := range cm.clusters { 143 cluster.onStop() 144 delete(cm.clusters, name) 145 } 146 147 return nil 148 } 149 150 func (cm *clusterMesh) newRemoteCluster(name, path string) *remoteCluster { 151 rc := &remoteCluster{ 152 name: name, 153 configPath: path, 154 clusterSizeDependantInterval: cm.conf.ClusterSizeDependantInterval, 155 156 resolvers: func() []dial.Resolver { 157 if cm.conf.ServiceResolver != nil { 158 return []dial.Resolver{cm.conf.ServiceResolver} 159 } 160 return nil 161 }(), 162 163 changed: make(chan bool, configNotificationsChannelSize), 164 controllers: controller.NewManager(), 165 166 logger: log.WithField(logfields.ClusterName, name), 167 168 metricLastFailureTimestamp: cm.conf.Metrics.LastFailureTimestamp.WithLabelValues(cm.conf.ClusterInfo.Name, cm.conf.NodeName, name), 169 metricReadinessStatus: cm.conf.Metrics.ReadinessStatus.WithLabelValues(cm.conf.ClusterInfo.Name, cm.conf.NodeName, name), 170 metricTotalFailures: cm.conf.Metrics.TotalFailures.WithLabelValues(cm.conf.ClusterInfo.Name, cm.conf.NodeName, name), 171 } 172 173 rc.RemoteCluster = cm.conf.NewRemoteCluster(name, rc.status) 174 return rc 175 } 176 177 func (cm *clusterMesh) add(name, path string) { 178 if name == cm.conf.ClusterInfo.Name { 179 log.WithField(fieldClusterName, name).Debug("Ignoring configuration for own cluster") 180 return 181 } 182 183 if err := types.ValidateClusterName(name); err != nil { 184 log.WithField(fieldClusterName, name).WithError(err). 185 Error("Remote cluster name is invalid. The connection will be forbidden starting from Cilium v1.17") 186 } 187 188 cm.mutex.Lock() 189 defer cm.mutex.Unlock() 190 cm.addLocked(name, path) 191 } 192 193 func (cm *clusterMesh) addLocked(name, path string) { 194 if _, ok := cm.tombstones[name]; ok { 195 // The configuration for this cluster has been recreated before the cleanup 196 // of the same cluster completed. Let's queue it for delayed processing. 197 cm.tombstones[name] = path 198 log.WithField(fieldClusterName, name).Info("Delaying configuration of remote cluster, which is still being removed") 199 return 200 } 201 202 inserted := false 203 cluster, ok := cm.clusters[name] 204 if !ok { 205 cluster = cm.newRemoteCluster(name, path) 206 cm.clusters[name] = cluster 207 inserted = true 208 } 209 210 cm.conf.Metrics.TotalRemoteClusters.WithLabelValues(cm.conf.ClusterInfo.Name, cm.conf.NodeName).Set(float64(len(cm.clusters))) 211 212 if inserted { 213 cluster.onInsert() 214 } else { 215 // signal a change in configuration 216 cluster.changed <- true 217 } 218 } 219 220 func (cm *clusterMesh) remove(name string) { 221 const removed = "" 222 223 cm.mutex.Lock() 224 defer cm.mutex.Unlock() 225 226 cluster, ok := cm.clusters[name] 227 if !ok { 228 if _, alreadyRemoving := cm.tombstones[name]; alreadyRemoving { 229 // Reset possibly queued add events 230 cm.tombstones[name] = removed 231 } 232 233 return 234 } 235 236 cm.tombstones[name] = removed 237 delete(cm.clusters, name) 238 cm.conf.Metrics.TotalRemoteClusters.WithLabelValues(cm.conf.ClusterInfo.Name, cm.conf.NodeName).Set(float64(len(cm.clusters))) 239 240 cm.wg.Add(1) 241 go func() { 242 defer cm.wg.Done() 243 244 // Run onRemove in a separate go routing as potentially slow, to avoid 245 // blocking the processing of further events in the meanwhile. 246 cluster.onRemove(cm.rctx) 247 248 cm.mutex.Lock() 249 path := cm.tombstones[name] 250 delete(cm.tombstones, name) 251 252 if path != removed { 253 // Let's replay the queued add event. 254 log.WithField(fieldClusterName, name).Info("Replaying delayed configuration of new remote cluster after removal") 255 cm.addLocked(name, path) 256 } 257 cm.mutex.Unlock() 258 }() 259 260 log.WithField(fieldClusterName, name).Debug("Remote cluster configuration removed") 261 } 262 263 // NumReadyClusters returns the number of remote clusters to which a connection 264 // has been established 265 func (cm *clusterMesh) NumReadyClusters() int { 266 cm.mutex.RLock() 267 defer cm.mutex.RUnlock() 268 269 nready := 0 270 for _, cm := range cm.clusters { 271 if cm.isReady() { 272 nready++ 273 } 274 } 275 276 return nready 277 } 278 279 func (cm *clusterMesh) ForEachRemoteCluster(fn func(RemoteCluster) error) error { 280 cm.mutex.RLock() 281 defer cm.mutex.RUnlock() 282 283 for _, cluster := range cm.clusters { 284 if err := fn(cluster.RemoteCluster); err != nil { 285 return err 286 } 287 } 288 289 return nil 290 }