github.com/argoproj/argo-cd/v2@v2.10.9/controller/cache/cache.go (about) 1 package cache 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "math" 8 "net" 9 "net/url" 10 "os/exec" 11 "reflect" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 clustercache "github.com/argoproj/gitops-engine/pkg/cache" 18 "github.com/argoproj/gitops-engine/pkg/health" 19 "github.com/argoproj/gitops-engine/pkg/utils/kube" 20 log "github.com/sirupsen/logrus" 21 "golang.org/x/sync/semaphore" 22 v1 "k8s.io/api/core/v1" 23 kerrors "k8s.io/apimachinery/pkg/api/errors" 24 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 26 "k8s.io/apimachinery/pkg/runtime/schema" 27 "k8s.io/apimachinery/pkg/watch" 28 "k8s.io/client-go/rest" 29 "k8s.io/client-go/tools/cache" 30 31 "github.com/argoproj/argo-cd/v2/controller/metrics" 32 "github.com/argoproj/argo-cd/v2/controller/sharding" 33 "github.com/argoproj/argo-cd/v2/pkg/apis/application" 34 appv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" 35 "github.com/argoproj/argo-cd/v2/util/argo" 36 "github.com/argoproj/argo-cd/v2/util/argo/normalizers" 37 "github.com/argoproj/argo-cd/v2/util/db" 38 "github.com/argoproj/argo-cd/v2/util/env" 39 logutils "github.com/argoproj/argo-cd/v2/util/log" 40 "github.com/argoproj/argo-cd/v2/util/lua" 41 "github.com/argoproj/argo-cd/v2/util/settings" 42 ) 43 44 const ( 45 // EnvClusterCacheResyncDuration is the env variable that holds cluster cache re-sync duration 46 EnvClusterCacheResyncDuration = "ARGOCD_CLUSTER_CACHE_RESYNC_DURATION" 47 48 // EnvClusterCacheWatchResyncDuration is the env variable that holds cluster cache watch re-sync duration 49 EnvClusterCacheWatchResyncDuration = "ARGOCD_CLUSTER_CACHE_WATCH_RESYNC_DURATION" 50 51 // EnvClusterSyncRetryTimeoutDuration is the env variable that holds cluster retry duration when sync error happens 52 EnvClusterSyncRetryTimeoutDuration = "ARGOCD_CLUSTER_SYNC_RETRY_TIMEOUT_DURATION" 53 54 // EnvClusterCacheListPageSize is the env variable to control size of the list page size when making K8s queries 55 EnvClusterCacheListPageSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_SIZE" 56 57 // EnvClusterCacheListPageBufferSize is the env variable to control the number of pages to buffer when making a K8s query to list resources 58 EnvClusterCacheListPageBufferSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_BUFFER_SIZE" 59 60 // EnvClusterCacheListSemaphore is the env variable to control size of the list semaphore 61 // This is used to limit the number of concurrent memory consuming operations on the 62 // k8s list queries results across all clusters to avoid memory spikes during cache initialization. 63 EnvClusterCacheListSemaphore = "ARGOCD_CLUSTER_CACHE_LIST_SEMAPHORE" 64 65 // EnvClusterCacheAttemptLimit is the env variable to control the retry limit for listing resources during cluster cache sync 66 EnvClusterCacheAttemptLimit = "ARGOCD_CLUSTER_CACHE_ATTEMPT_LIMIT" 67 68 // EnvClusterCacheRetryUseBackoff is the env variable to control whether to use a backoff strategy with the retry during cluster cache sync 69 EnvClusterCacheRetryUseBackoff = "ARGOCD_CLUSTER_CACHE_RETRY_USE_BACKOFF" 70 ) 71 72 // GitOps engine cluster cache tuning options 73 var ( 74 // clusterCacheResyncDuration controls the duration of cluster cache refresh. 75 // NOTE: this differs from gitops-engine default of 24h 76 clusterCacheResyncDuration = 12 * time.Hour 77 78 // clusterCacheWatchResyncDuration controls the maximum duration that group/kind watches are allowed to run 79 // for before relisting & restarting the watch 80 clusterCacheWatchResyncDuration = 10 * time.Minute 81 82 // clusterSyncRetryTimeoutDuration controls the sync retry duration when cluster sync error happens 83 clusterSyncRetryTimeoutDuration = 10 * time.Second 84 85 // The default limit of 50 is chosen based on experiments. 86 clusterCacheListSemaphoreSize int64 = 50 87 88 // clusterCacheListPageSize is the page size when performing K8s list requests. 89 // 500 is equal to kubectl's size 90 clusterCacheListPageSize int64 = 500 91 92 // clusterCacheListPageBufferSize is the number of pages to buffer when performing K8s list requests 93 clusterCacheListPageBufferSize int32 = 1 94 95 // clusterCacheRetryLimit sets a retry limit for failed requests during cluster cache sync 96 // If set to 1, retries are disabled. 97 clusterCacheAttemptLimit int32 = 1 98 99 // clusterCacheRetryUseBackoff specifies whether to use a backoff strategy on cluster cache sync, if retry is enabled 100 clusterCacheRetryUseBackoff bool = false 101 ) 102 103 func init() { 104 clusterCacheResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheResyncDuration, clusterCacheResyncDuration, 0, math.MaxInt64) 105 clusterCacheWatchResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheWatchResyncDuration, clusterCacheWatchResyncDuration, 0, math.MaxInt64) 106 clusterSyncRetryTimeoutDuration = env.ParseDurationFromEnv(EnvClusterSyncRetryTimeoutDuration, clusterSyncRetryTimeoutDuration, 0, math.MaxInt64) 107 clusterCacheListPageSize = env.ParseInt64FromEnv(EnvClusterCacheListPageSize, clusterCacheListPageSize, 0, math.MaxInt64) 108 clusterCacheListPageBufferSize = int32(env.ParseNumFromEnv(EnvClusterCacheListPageBufferSize, int(clusterCacheListPageBufferSize), 1, math.MaxInt32)) 109 clusterCacheListSemaphoreSize = env.ParseInt64FromEnv(EnvClusterCacheListSemaphore, clusterCacheListSemaphoreSize, 0, math.MaxInt64) 110 clusterCacheAttemptLimit = int32(env.ParseNumFromEnv(EnvClusterCacheAttemptLimit, int(clusterCacheAttemptLimit), 1, math.MaxInt32)) 111 clusterCacheRetryUseBackoff = env.ParseBoolFromEnv(EnvClusterCacheRetryUseBackoff, false) 112 } 113 114 type LiveStateCache interface { 115 // Returns k8s server version 116 GetVersionsInfo(serverURL string) (string, []kube.APIResourceInfo, error) 117 // Returns true of given group kind is a namespaced resource 118 IsNamespaced(server string, gk schema.GroupKind) (bool, error) 119 // Returns synced cluster cache 120 GetClusterCache(server string) (clustercache.ClusterCache, error) 121 // Executes give callback against resource specified by the key and all its children 122 IterateHierarchy(server string, key kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error 123 // Returns state of live nodes which correspond for target nodes of specified application. 124 GetManagedLiveObjs(a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error) 125 // IterateResources iterates all resource stored in cache 126 IterateResources(server string, callback func(res *clustercache.Resource, info *ResourceInfo)) error 127 // Returns all top level resources (resources without owner references) of a specified namespace 128 GetNamespaceTopLevelResources(server string, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error) 129 // Starts watching resources of each controlled cluster. 130 Run(ctx context.Context) error 131 // Returns information about monitored clusters 132 GetClustersInfo() []clustercache.ClusterInfo 133 // Init must be executed before cache can be used 134 Init() error 135 } 136 137 type ObjectUpdatedHandler = func(managedByApp map[string]bool, ref v1.ObjectReference) 138 139 type PodInfo struct { 140 NodeName string 141 ResourceRequests v1.ResourceList 142 Phase v1.PodPhase 143 } 144 145 type NodeInfo struct { 146 Name string 147 Capacity v1.ResourceList 148 SystemInfo v1.NodeSystemInfo 149 } 150 151 type ResourceInfo struct { 152 Info []appv1.InfoItem 153 AppName string 154 Images []string 155 Health *health.HealthStatus 156 // NetworkingInfo are available only for known types involved into networking: Ingress, Service, Pod 157 NetworkingInfo *appv1.ResourceNetworkingInfo 158 // PodInfo is available for pods only 159 PodInfo *PodInfo 160 // NodeInfo is available for nodes only 161 NodeInfo *NodeInfo 162 163 manifestHash string 164 } 165 166 func NewLiveStateCache( 167 db db.ArgoDB, 168 appInformer cache.SharedIndexInformer, 169 settingsMgr *settings.SettingsManager, 170 kubectl kube.Kubectl, 171 metricsServer *metrics.MetricsServer, 172 onObjectUpdated ObjectUpdatedHandler, 173 clusterSharding sharding.ClusterShardingCache, 174 resourceTracking argo.ResourceTracking) LiveStateCache { 175 176 return &liveStateCache{ 177 appInformer: appInformer, 178 db: db, 179 clusters: make(map[string]clustercache.ClusterCache), 180 onObjectUpdated: onObjectUpdated, 181 kubectl: kubectl, 182 settingsMgr: settingsMgr, 183 metricsServer: metricsServer, 184 clusterSharding: clusterSharding, 185 resourceTracking: resourceTracking, 186 } 187 } 188 189 type cacheSettings struct { 190 clusterSettings clustercache.Settings 191 appInstanceLabelKey string 192 trackingMethod appv1.TrackingMethod 193 // resourceOverrides provides a list of ignored differences to ignore watched resource updates 194 resourceOverrides map[string]appv1.ResourceOverride 195 196 // ignoreResourceUpdates is a flag to enable resource-ignore rules. 197 ignoreResourceUpdatesEnabled bool 198 } 199 200 type liveStateCache struct { 201 db db.ArgoDB 202 appInformer cache.SharedIndexInformer 203 onObjectUpdated ObjectUpdatedHandler 204 kubectl kube.Kubectl 205 settingsMgr *settings.SettingsManager 206 metricsServer *metrics.MetricsServer 207 clusterSharding sharding.ClusterShardingCache 208 resourceTracking argo.ResourceTracking 209 ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts 210 211 clusters map[string]clustercache.ClusterCache 212 cacheSettings cacheSettings 213 lock sync.RWMutex 214 } 215 216 func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) { 217 appInstanceLabelKey, err := c.settingsMgr.GetAppInstanceLabelKey() 218 if err != nil { 219 return nil, err 220 } 221 resourceUpdatesOverrides, err := c.settingsMgr.GetIgnoreResourceUpdatesOverrides() 222 if err != nil { 223 return nil, err 224 } 225 ignoreResourceUpdatesEnabled, err := c.settingsMgr.GetIsIgnoreResourceUpdatesEnabled() 226 if err != nil { 227 return nil, err 228 } 229 resourcesFilter, err := c.settingsMgr.GetResourcesFilter() 230 if err != nil { 231 return nil, err 232 } 233 resourceOverrides, err := c.settingsMgr.GetResourceOverrides() 234 if err != nil { 235 return nil, err 236 } 237 clusterSettings := clustercache.Settings{ 238 ResourceHealthOverride: lua.ResourceHealthOverrides(resourceOverrides), 239 ResourcesFilter: resourcesFilter, 240 } 241 242 return &cacheSettings{clusterSettings, appInstanceLabelKey, argo.GetTrackingMethod(c.settingsMgr), resourceUpdatesOverrides, ignoreResourceUpdatesEnabled}, nil 243 } 244 245 func asResourceNode(r *clustercache.Resource) appv1.ResourceNode { 246 gv, err := schema.ParseGroupVersion(r.Ref.APIVersion) 247 if err != nil { 248 gv = schema.GroupVersion{} 249 } 250 parentRefs := make([]appv1.ResourceRef, len(r.OwnerRefs)) 251 for i, ownerRef := range r.OwnerRefs { 252 ownerGvk := schema.FromAPIVersionAndKind(ownerRef.APIVersion, ownerRef.Kind) 253 ownerKey := kube.NewResourceKey(ownerGvk.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name) 254 parentRefs[i] = appv1.ResourceRef{Name: ownerRef.Name, Kind: ownerKey.Kind, Namespace: r.Ref.Namespace, Group: ownerKey.Group, UID: string(ownerRef.UID)} 255 } 256 var resHealth *appv1.HealthStatus 257 resourceInfo := resInfo(r) 258 if resourceInfo.Health != nil { 259 resHealth = &appv1.HealthStatus{Status: resourceInfo.Health.Status, Message: resourceInfo.Health.Message} 260 } 261 return appv1.ResourceNode{ 262 ResourceRef: appv1.ResourceRef{ 263 UID: string(r.Ref.UID), 264 Name: r.Ref.Name, 265 Group: gv.Group, 266 Version: gv.Version, 267 Kind: r.Ref.Kind, 268 Namespace: r.Ref.Namespace, 269 }, 270 ParentRefs: parentRefs, 271 Info: resourceInfo.Info, 272 ResourceVersion: r.ResourceVersion, 273 NetworkingInfo: resourceInfo.NetworkingInfo, 274 Images: resourceInfo.Images, 275 Health: resHealth, 276 CreatedAt: r.CreationTimestamp, 277 } 278 } 279 280 func resInfo(r *clustercache.Resource) *ResourceInfo { 281 info, ok := r.Info.(*ResourceInfo) 282 if !ok || info == nil { 283 info = &ResourceInfo{} 284 } 285 return info 286 } 287 288 func isRootAppNode(r *clustercache.Resource) bool { 289 return resInfo(r).AppName != "" && len(r.OwnerRefs) == 0 290 } 291 292 func getApp(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource) string { 293 return getAppRecursive(r, ns, map[kube.ResourceKey]bool{}) 294 } 295 296 func ownerRefGV(ownerRef metav1.OwnerReference) schema.GroupVersion { 297 gv, err := schema.ParseGroupVersion(ownerRef.APIVersion) 298 if err != nil { 299 gv = schema.GroupVersion{} 300 } 301 return gv 302 } 303 304 func getAppRecursive(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource, visited map[kube.ResourceKey]bool) string { 305 if !visited[r.ResourceKey()] { 306 visited[r.ResourceKey()] = true 307 } else { 308 log.Warnf("Circular dependency detected: %v.", visited) 309 return resInfo(r).AppName 310 } 311 312 if resInfo(r).AppName != "" { 313 return resInfo(r).AppName 314 } 315 for _, ownerRef := range r.OwnerRefs { 316 gv := ownerRefGV(ownerRef) 317 if parent, ok := ns[kube.NewResourceKey(gv.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name)]; ok { 318 app := getAppRecursive(parent, ns, visited) 319 if app != "" { 320 return app 321 } 322 } 323 } 324 return "" 325 } 326 327 var ( 328 ignoredRefreshResources = map[string]bool{ 329 "/" + kube.EndpointsKind: true, 330 } 331 ) 332 333 // skipAppRequeuing checks if the object is an API type which we want to skip requeuing against. 334 // We ignore API types which have a high churn rate, and/or whose updates are irrelevant to the app 335 func skipAppRequeuing(key kube.ResourceKey) bool { 336 return ignoredRefreshResources[key.Group+"/"+key.Kind] 337 } 338 339 func skipResourceUpdate(oldInfo, newInfo *ResourceInfo) bool { 340 if oldInfo == nil || newInfo == nil { 341 return false 342 } 343 isSameHealthStatus := (oldInfo.Health == nil && newInfo.Health == nil) || oldInfo.Health != nil && newInfo.Health != nil && oldInfo.Health.Status == newInfo.Health.Status 344 isSameManifest := oldInfo.manifestHash != "" && newInfo.manifestHash != "" && oldInfo.manifestHash == newInfo.manifestHash 345 return isSameHealthStatus && isSameManifest 346 } 347 348 // shouldHashManifest validates if the API resource needs to be hashed. 349 // If there's an app name from resource tracking, or if this is itself an app, we should generate a hash. 350 // Otherwise, the hashing should be skipped to save CPU time. 351 func shouldHashManifest(appName string, gvk schema.GroupVersionKind) bool { 352 // Only hash if the resource belongs to an app. 353 // Best - Only hash for resources that are part of an app or their dependencies 354 // (current) - Only hash for resources that are part of an app + all apps that might be from an ApplicationSet 355 // Orphan - If orphan is enabled, hash should be made on all resource of that namespace and a config to disable it 356 // Worst - Hash all resources watched by Argo 357 return appName != "" || (gvk.Group == application.Group && gvk.Kind == application.ApplicationKind) 358 } 359 360 // isRetryableError is a helper method to see whether an error 361 // returned from the dynamic client is potentially retryable. 362 func isRetryableError(err error) bool { 363 if err == nil { 364 return false 365 } 366 return kerrors.IsInternalError(err) || 367 kerrors.IsInvalid(err) || 368 kerrors.IsTooManyRequests(err) || 369 kerrors.IsServerTimeout(err) || 370 kerrors.IsServiceUnavailable(err) || 371 kerrors.IsTimeout(err) || 372 kerrors.IsUnexpectedObjectError(err) || 373 kerrors.IsUnexpectedServerError(err) || 374 isResourceQuotaConflictErr(err) || 375 isTransientNetworkErr(err) || 376 isExceededQuotaErr(err) || 377 errors.Is(err, syscall.ECONNRESET) 378 } 379 380 func isExceededQuotaErr(err error) bool { 381 return kerrors.IsForbidden(err) && strings.Contains(err.Error(), "exceeded quota") 382 } 383 384 func isResourceQuotaConflictErr(err error) bool { 385 return kerrors.IsConflict(err) && strings.Contains(err.Error(), "Operation cannot be fulfilled on resourcequota") 386 } 387 388 func isTransientNetworkErr(err error) bool { 389 switch err.(type) { 390 case net.Error: 391 switch err.(type) { 392 case *net.DNSError, *net.OpError, net.UnknownNetworkError: 393 return true 394 case *url.Error: 395 // For a URL error, where it replies "connection closed" 396 // retry again. 397 return strings.Contains(err.Error(), "Connection closed by foreign host") 398 } 399 } 400 401 errorString := err.Error() 402 if exitErr, ok := err.(*exec.ExitError); ok { 403 errorString = fmt.Sprintf("%s %s", errorString, exitErr.Stderr) 404 } 405 if strings.Contains(errorString, "net/http: TLS handshake timeout") || 406 strings.Contains(errorString, "i/o timeout") || 407 strings.Contains(errorString, "connection timed out") || 408 strings.Contains(errorString, "connection reset by peer") { 409 return true 410 } 411 return false 412 } 413 414 func (c *liveStateCache) getCluster(server string) (clustercache.ClusterCache, error) { 415 c.lock.RLock() 416 clusterCache, ok := c.clusters[server] 417 cacheSettings := c.cacheSettings 418 c.lock.RUnlock() 419 420 if ok { 421 return clusterCache, nil 422 } 423 424 c.lock.Lock() 425 defer c.lock.Unlock() 426 427 clusterCache, ok = c.clusters[server] 428 if ok { 429 return clusterCache, nil 430 } 431 432 cluster, err := c.db.GetCluster(context.Background(), server) 433 if err != nil { 434 return nil, fmt.Errorf("error getting cluster: %w", err) 435 } 436 437 if !c.canHandleCluster(cluster) { 438 return nil, fmt.Errorf("controller is configured to ignore cluster %s", cluster.Server) 439 } 440 441 resourceCustomLabels, err := c.settingsMgr.GetResourceCustomLabels() 442 if err != nil { 443 return nil, fmt.Errorf("error getting custom label: %w", err) 444 } 445 446 respectRBAC, err := c.settingsMgr.RespectRBAC() 447 if err != nil { 448 return nil, fmt.Errorf("error getting value for %v: %w", settings.RespectRBAC, err) 449 } 450 451 clusterCacheConfig := cluster.RESTConfig() 452 // Controller dynamically fetches all resource types available on the cluster 453 // using a discovery API that may contain deprecated APIs. 454 // This causes log flooding when managing a large number of clusters. 455 // https://github.com/argoproj/argo-cd/issues/11973 456 // However, we can safely suppress deprecation warnings 457 // because we do not rely on resources with a particular API group or version. 458 // https://kubernetes.io/blog/2020/09/03/warnings/#customize-client-handling 459 // 460 // Completely suppress warning logs only for log levels that are less than Debug. 461 if log.GetLevel() < log.DebugLevel { 462 clusterCacheConfig.WarningHandler = rest.NoWarnings{} 463 } 464 465 clusterCacheOpts := []clustercache.UpdateSettingsFunc{ 466 clustercache.SetListSemaphore(semaphore.NewWeighted(clusterCacheListSemaphoreSize)), 467 clustercache.SetListPageSize(clusterCacheListPageSize), 468 clustercache.SetListPageBufferSize(clusterCacheListPageBufferSize), 469 clustercache.SetWatchResyncTimeout(clusterCacheWatchResyncDuration), 470 clustercache.SetClusterSyncRetryTimeout(clusterSyncRetryTimeoutDuration), 471 clustercache.SetResyncTimeout(clusterCacheResyncDuration), 472 clustercache.SetSettings(cacheSettings.clusterSettings), 473 clustercache.SetNamespaces(cluster.Namespaces), 474 clustercache.SetClusterResources(cluster.ClusterResources), 475 clustercache.SetPopulateResourceInfoHandler(func(un *unstructured.Unstructured, isRoot bool) (interface{}, bool) { 476 res := &ResourceInfo{} 477 populateNodeInfo(un, res, resourceCustomLabels) 478 c.lock.RLock() 479 cacheSettings := c.cacheSettings 480 c.lock.RUnlock() 481 482 res.Health, _ = health.GetResourceHealth(un, cacheSettings.clusterSettings.ResourceHealthOverride) 483 484 appName := c.resourceTracking.GetAppName(un, cacheSettings.appInstanceLabelKey, cacheSettings.trackingMethod) 485 if isRoot && appName != "" { 486 res.AppName = appName 487 } 488 489 gvk := un.GroupVersionKind() 490 491 if cacheSettings.ignoreResourceUpdatesEnabled && shouldHashManifest(appName, gvk) { 492 hash, err := generateManifestHash(un, nil, cacheSettings.resourceOverrides, c.ignoreNormalizerOpts) 493 if err != nil { 494 log.Errorf("Failed to generate manifest hash: %v", err) 495 } else { 496 res.manifestHash = hash 497 } 498 } 499 500 // edge case. we do not label CRDs, so they miss the tracking label we inject. But we still 501 // want the full resource to be available in our cache (to diff), so we store all CRDs 502 return res, res.AppName != "" || gvk.Kind == kube.CustomResourceDefinitionKind 503 }), 504 clustercache.SetLogr(logutils.NewLogrusLogger(log.WithField("server", cluster.Server))), 505 clustercache.SetRetryOptions(clusterCacheAttemptLimit, clusterCacheRetryUseBackoff, isRetryableError), 506 clustercache.SetRespectRBAC(respectRBAC), 507 } 508 509 clusterCache = clustercache.NewClusterCache(clusterCacheConfig, clusterCacheOpts...) 510 511 _ = clusterCache.OnResourceUpdated(func(newRes *clustercache.Resource, oldRes *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) { 512 toNotify := make(map[string]bool) 513 var ref v1.ObjectReference 514 if newRes != nil { 515 ref = newRes.Ref 516 } else { 517 ref = oldRes.Ref 518 } 519 520 c.lock.RLock() 521 cacheSettings := c.cacheSettings 522 c.lock.RUnlock() 523 524 if cacheSettings.ignoreResourceUpdatesEnabled && oldRes != nil && newRes != nil && skipResourceUpdate(resInfo(oldRes), resInfo(newRes)) { 525 // Additional check for debug level so we don't need to evaluate the 526 // format string in case of non-debug scenarios 527 if log.GetLevel() >= log.DebugLevel { 528 namespace := ref.Namespace 529 if ref.Namespace == "" { 530 namespace = "(cluster-scoped)" 531 } 532 log.WithFields(log.Fields{ 533 "server": cluster.Server, 534 "namespace": namespace, 535 "name": ref.Name, 536 "api-version": ref.APIVersion, 537 "kind": ref.Kind, 538 }).Debug("Ignoring change of object because none of the watched resource fields have changed") 539 } 540 return 541 } 542 543 for _, r := range []*clustercache.Resource{newRes, oldRes} { 544 if r == nil { 545 continue 546 } 547 app := getApp(r, namespaceResources) 548 if app == "" || skipAppRequeuing(r.ResourceKey()) { 549 continue 550 } 551 toNotify[app] = isRootAppNode(r) || toNotify[app] 552 } 553 c.onObjectUpdated(toNotify, ref) 554 }) 555 556 _ = clusterCache.OnEvent(func(event watch.EventType, un *unstructured.Unstructured) { 557 gvk := un.GroupVersionKind() 558 c.metricsServer.IncClusterEventsCount(cluster.Server, gvk.Group, gvk.Kind) 559 }) 560 561 c.clusters[server] = clusterCache 562 563 return clusterCache, nil 564 } 565 566 func (c *liveStateCache) getSyncedCluster(server string) (clustercache.ClusterCache, error) { 567 clusterCache, err := c.getCluster(server) 568 if err != nil { 569 return nil, fmt.Errorf("error getting cluster: %w", err) 570 } 571 err = clusterCache.EnsureSynced() 572 if err != nil { 573 return nil, fmt.Errorf("error synchronizing cache state : %w", err) 574 } 575 return clusterCache, nil 576 } 577 578 func (c *liveStateCache) invalidate(cacheSettings cacheSettings) { 579 log.Info("invalidating live state cache") 580 c.lock.Lock() 581 c.cacheSettings = cacheSettings 582 clusters := c.clusters 583 c.lock.Unlock() 584 585 for _, clust := range clusters { 586 clust.Invalidate(clustercache.SetSettings(cacheSettings.clusterSettings)) 587 } 588 log.Info("live state cache invalidated") 589 } 590 591 func (c *liveStateCache) IsNamespaced(server string, gk schema.GroupKind) (bool, error) { 592 clusterInfo, err := c.getSyncedCluster(server) 593 if err != nil { 594 return false, err 595 } 596 return clusterInfo.IsNamespaced(gk) 597 } 598 599 func (c *liveStateCache) IterateHierarchy(server string, key kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error { 600 clusterInfo, err := c.getSyncedCluster(server) 601 if err != nil { 602 return err 603 } 604 clusterInfo.IterateHierarchy(key, func(resource *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) bool { 605 return action(asResourceNode(resource), getApp(resource, namespaceResources)) 606 }) 607 return nil 608 } 609 610 func (c *liveStateCache) IterateResources(server string, callback func(res *clustercache.Resource, info *ResourceInfo)) error { 611 clusterInfo, err := c.getSyncedCluster(server) 612 if err != nil { 613 return err 614 } 615 _ = clusterInfo.FindResources("", func(r *clustercache.Resource) bool { 616 if info, ok := r.Info.(*ResourceInfo); ok { 617 callback(r, info) 618 } 619 return false 620 }) 621 return nil 622 } 623 624 func (c *liveStateCache) GetNamespaceTopLevelResources(server string, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error) { 625 clusterInfo, err := c.getSyncedCluster(server) 626 if err != nil { 627 return nil, err 628 } 629 resources := clusterInfo.FindResources(namespace, clustercache.TopLevelResource) 630 res := make(map[kube.ResourceKey]appv1.ResourceNode) 631 for k, r := range resources { 632 res[k] = asResourceNode(r) 633 } 634 return res, nil 635 } 636 637 func (c *liveStateCache) GetManagedLiveObjs(a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error) { 638 clusterInfo, err := c.getSyncedCluster(a.Spec.Destination.Server) 639 if err != nil { 640 return nil, fmt.Errorf("failed to get cluster info for %q: %w", a.Spec.Destination.Server, err) 641 } 642 return clusterInfo.GetManagedLiveObjs(targetObjs, func(r *clustercache.Resource) bool { 643 return resInfo(r).AppName == a.InstanceName(c.settingsMgr.GetNamespace()) 644 }) 645 } 646 647 func (c *liveStateCache) GetVersionsInfo(serverURL string) (string, []kube.APIResourceInfo, error) { 648 clusterInfo, err := c.getSyncedCluster(serverURL) 649 if err != nil { 650 return "", nil, fmt.Errorf("failed to get cluster info for %q: %w", serverURL, err) 651 } 652 return clusterInfo.GetServerVersion(), clusterInfo.GetAPIResources(), nil 653 } 654 655 func (c *liveStateCache) isClusterHasApps(apps []interface{}, cluster *appv1.Cluster) bool { 656 for _, obj := range apps { 657 app, ok := obj.(*appv1.Application) 658 if !ok { 659 continue 660 } 661 err := argo.ValidateDestination(context.Background(), &app.Spec.Destination, c.db) 662 if err != nil { 663 continue 664 } 665 if app.Spec.Destination.Server == cluster.Server { 666 return true 667 } 668 } 669 return false 670 } 671 672 func (c *liveStateCache) watchSettings(ctx context.Context) { 673 updateCh := make(chan *settings.ArgoCDSettings, 1) 674 c.settingsMgr.Subscribe(updateCh) 675 676 done := false 677 for !done { 678 select { 679 case <-updateCh: 680 nextCacheSettings, err := c.loadCacheSettings() 681 if err != nil { 682 log.Warnf("Failed to read updated settings: %v", err) 683 continue 684 } 685 686 c.lock.Lock() 687 needInvalidate := false 688 if !reflect.DeepEqual(c.cacheSettings, *nextCacheSettings) { 689 c.cacheSettings = *nextCacheSettings 690 needInvalidate = true 691 } 692 c.lock.Unlock() 693 if needInvalidate { 694 c.invalidate(*nextCacheSettings) 695 } 696 case <-ctx.Done(): 697 done = true 698 } 699 } 700 log.Info("shutting down settings watch") 701 c.settingsMgr.Unsubscribe(updateCh) 702 close(updateCh) 703 } 704 705 func (c *liveStateCache) Init() error { 706 cacheSettings, err := c.loadCacheSettings() 707 if err != nil { 708 return fmt.Errorf("error loading cache settings: %w", err) 709 } 710 c.cacheSettings = *cacheSettings 711 return nil 712 } 713 714 // Run watches for resource changes annotated with application label on all registered clusters and schedule corresponding app refresh. 715 func (c *liveStateCache) Run(ctx context.Context) error { 716 go c.watchSettings(ctx) 717 718 kube.RetryUntilSucceed(ctx, clustercache.ClusterRetryTimeout, "watch clusters", logutils.NewLogrusLogger(logutils.NewWithCurrentConfig()), func() error { 719 return c.db.WatchClusters(ctx, c.handleAddEvent, c.handleModEvent, c.handleDeleteEvent) 720 }) 721 722 <-ctx.Done() 723 c.invalidate(c.cacheSettings) 724 return nil 725 } 726 727 func (c *liveStateCache) canHandleCluster(cluster *appv1.Cluster) bool { 728 return c.clusterSharding.IsManagedCluster(cluster) 729 } 730 731 func (c *liveStateCache) handleAddEvent(cluster *appv1.Cluster) { 732 c.clusterSharding.Add(cluster) 733 if !c.canHandleCluster(cluster) { 734 log.Infof("Ignoring cluster %s", cluster.Server) 735 return 736 } 737 c.lock.Lock() 738 _, ok := c.clusters[cluster.Server] 739 c.lock.Unlock() 740 if !ok { 741 log.Debugf("Checking if cache %v / cluster %v has appInformer %v", c, cluster, c.appInformer) 742 if c.appInformer == nil { 743 log.Warn("Cannot get a cluster appInformer. Cache may not be started this time") 744 return 745 } 746 if c.isClusterHasApps(c.appInformer.GetStore().List(), cluster) { 747 go func() { 748 // warm up cache for cluster with apps 749 _, _ = c.getSyncedCluster(cluster.Server) 750 }() 751 } 752 } 753 } 754 755 func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *appv1.Cluster) { 756 c.clusterSharding.Update(oldCluster, newCluster) 757 c.lock.Lock() 758 cluster, ok := c.clusters[newCluster.Server] 759 c.lock.Unlock() 760 if ok { 761 if !c.canHandleCluster(newCluster) { 762 cluster.Invalidate() 763 c.lock.Lock() 764 delete(c.clusters, newCluster.Server) 765 c.lock.Unlock() 766 return 767 } 768 769 var updateSettings []clustercache.UpdateSettingsFunc 770 if !reflect.DeepEqual(oldCluster.Config, newCluster.Config) { 771 updateSettings = append(updateSettings, clustercache.SetConfig(newCluster.RESTConfig())) 772 } 773 if !reflect.DeepEqual(oldCluster.Namespaces, newCluster.Namespaces) { 774 updateSettings = append(updateSettings, clustercache.SetNamespaces(newCluster.Namespaces)) 775 } 776 if !reflect.DeepEqual(oldCluster.ClusterResources, newCluster.ClusterResources) { 777 updateSettings = append(updateSettings, clustercache.SetClusterResources(newCluster.ClusterResources)) 778 } 779 forceInvalidate := false 780 if newCluster.RefreshRequestedAt != nil && 781 cluster.GetClusterInfo().LastCacheSyncTime != nil && 782 cluster.GetClusterInfo().LastCacheSyncTime.Before(newCluster.RefreshRequestedAt.Time) { 783 forceInvalidate = true 784 } 785 786 if len(updateSettings) > 0 || forceInvalidate { 787 cluster.Invalidate(updateSettings...) 788 go func() { 789 // warm up cluster cache 790 _ = cluster.EnsureSynced() 791 }() 792 } 793 } 794 795 } 796 797 func (c *liveStateCache) handleDeleteEvent(clusterServer string) { 798 c.lock.RLock() 799 c.clusterSharding.Delete(clusterServer) 800 cluster, ok := c.clusters[clusterServer] 801 c.lock.RUnlock() 802 if ok { 803 cluster.Invalidate() 804 c.lock.Lock() 805 delete(c.clusters, clusterServer) 806 c.lock.Unlock() 807 } 808 } 809 810 func (c *liveStateCache) GetClustersInfo() []clustercache.ClusterInfo { 811 clusters := make(map[string]clustercache.ClusterCache) 812 c.lock.RLock() 813 for k := range c.clusters { 814 clusters[k] = c.clusters[k] 815 } 816 c.lock.RUnlock() 817 818 res := make([]clustercache.ClusterInfo, 0) 819 for server, c := range clusters { 820 info := c.GetClusterInfo() 821 info.Server = server 822 res = append(res, info) 823 } 824 return res 825 } 826 827 func (c *liveStateCache) GetClusterCache(server string) (clustercache.ClusterCache, error) { 828 return c.getSyncedCluster(server) 829 }