package cache

import (
	"context"
	"errors"
	"fmt"
	"math"
	"net"
	"net/url"
	"os/exec"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	clustercache "github.com/argoproj/gitops-engine/pkg/cache"
	"github.com/argoproj/gitops-engine/pkg/health"
	"github.com/argoproj/gitops-engine/pkg/utils/kube"
	log "github.com/sirupsen/logrus"
	"golang.org/x/sync/semaphore"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"

	"github.com/argoproj/argo-cd/v3/controller/metrics"
	"github.com/argoproj/argo-cd/v3/controller/sharding"
	"github.com/argoproj/argo-cd/v3/pkg/apis/application"
	appv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
	"github.com/argoproj/argo-cd/v3/util/argo"
	"github.com/argoproj/argo-cd/v3/util/argo/normalizers"
	"github.com/argoproj/argo-cd/v3/util/db"
	"github.com/argoproj/argo-cd/v3/util/env"
	logutils "github.com/argoproj/argo-cd/v3/util/log"
	"github.com/argoproj/argo-cd/v3/util/lua"
	"github.com/argoproj/argo-cd/v3/util/settings"
)

const (
	// EnvClusterCacheResyncDuration is the env variable that holds cluster cache re-sync duration
	EnvClusterCacheResyncDuration = "ARGOCD_CLUSTER_CACHE_RESYNC_DURATION"

	// EnvClusterCacheWatchResyncDuration is the env variable that holds cluster cache watch re-sync duration
	EnvClusterCacheWatchResyncDuration = "ARGOCD_CLUSTER_CACHE_WATCH_RESYNC_DURATION"

	// EnvClusterSyncRetryTimeoutDuration is the env variable that holds cluster retry duration when sync error happens
	EnvClusterSyncRetryTimeoutDuration = "ARGOCD_CLUSTER_SYNC_RETRY_TIMEOUT_DURATION"

	// EnvClusterCacheListPageSize is the env variable to control size of the list page size when making K8s queries
	EnvClusterCacheListPageSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_SIZE"

	// EnvClusterCacheListPageBufferSize is the env variable to control the number of pages to buffer when making a K8s query to list resources
	EnvClusterCacheListPageBufferSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_BUFFER_SIZE"

	// EnvClusterCacheListSemaphore is the env variable to control size of the list semaphore
	// This is used to limit the number of concurrent memory consuming operations on the
	// k8s list queries results across all clusters to avoid memory spikes during cache initialization.
	EnvClusterCacheListSemaphore = "ARGOCD_CLUSTER_CACHE_LIST_SEMAPHORE"

	// EnvClusterCacheAttemptLimit is the env variable to control the retry limit for listing resources during cluster cache sync
	EnvClusterCacheAttemptLimit = "ARGOCD_CLUSTER_CACHE_ATTEMPT_LIMIT"

	// EnvClusterCacheRetryUseBackoff is the env variable to control whether to use a backoff strategy with the retry during cluster cache sync
	EnvClusterCacheRetryUseBackoff = "ARGOCD_CLUSTER_CACHE_RETRY_USE_BACKOFF"

	// EnvClusterCacheBatchEventsProcessing is the env variable to control whether to enable batch events processing
	EnvClusterCacheBatchEventsProcessing = "ARGOCD_CLUSTER_CACHE_BATCH_EVENTS_PROCESSING"

	// EnvClusterCacheEventsProcessingInterval is the env variable to control the interval between processing events when BatchEventsProcessing is enabled
	EnvClusterCacheEventsProcessingInterval = "ARGOCD_CLUSTER_CACHE_EVENTS_PROCESSING_INTERVAL"

	// AnnotationIgnoreResourceUpdates when set to true on an untracked resource,
	// argo will apply `ignoreResourceUpdates` configuration on it.
	AnnotationIgnoreResourceUpdates = "argocd.argoproj.io/ignore-resource-updates"
)

// GitOps engine cluster cache tuning options.
// These package-level defaults may be overridden by the env variables above (see init).
var (
	// clusterCacheResyncDuration controls the duration of cluster cache refresh.
	// NOTE: this differs from gitops-engine default of 24h
	clusterCacheResyncDuration = 12 * time.Hour

	// clusterCacheWatchResyncDuration controls the maximum duration that group/kind watches are allowed to run
	// for before relisting & restarting the watch
	clusterCacheWatchResyncDuration = 10 * time.Minute

	// clusterSyncRetryTimeoutDuration controls the sync retry duration when cluster sync error happens
	clusterSyncRetryTimeoutDuration = 10 * time.Second

	// The default limit of 50 is chosen based on experiments.
	clusterCacheListSemaphoreSize int64 = 50

	// clusterCacheListPageSize is the page size when performing K8s list requests.
	// 500 is equal to kubectl's size
	clusterCacheListPageSize int64 = 500

	// clusterCacheListPageBufferSize is the number of pages to buffer when performing K8s list requests
	clusterCacheListPageBufferSize int32 = 1

	// clusterCacheRetryLimit sets a retry limit for failed requests during cluster cache sync
	// If set to 1, retries are disabled.
	clusterCacheAttemptLimit int32 = 1

	// clusterCacheRetryUseBackoff specifies whether to use a backoff strategy on cluster cache sync, if retry is enabled
	clusterCacheRetryUseBackoff = false

	// clusterCacheBatchEventsProcessing specifies whether to enable batch events processing
	// NOTE(review): the effective default is true — init() passes `true` to ParseBoolFromEnv
	// rather than this variable's value; this initializer is never the effective default.
	clusterCacheBatchEventsProcessing = false

	// clusterCacheEventsProcessingInterval specifies the interval between processing events when BatchEventsProcessing is enabled
	clusterCacheEventsProcessingInterval = 100 * time.Millisecond
)

// init overrides the tuning defaults above from their corresponding env variables.
func init() {
	clusterCacheResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheResyncDuration, clusterCacheResyncDuration, 0, math.MaxInt64)
	clusterCacheWatchResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheWatchResyncDuration, clusterCacheWatchResyncDuration, 0, math.MaxInt64)
	clusterSyncRetryTimeoutDuration = env.ParseDurationFromEnv(EnvClusterSyncRetryTimeoutDuration, clusterSyncRetryTimeoutDuration, 0, math.MaxInt64)
	clusterCacheListPageSize = env.ParseInt64FromEnv(EnvClusterCacheListPageSize, clusterCacheListPageSize, 0, math.MaxInt64)
	clusterCacheListPageBufferSize = int32(env.ParseNumFromEnv(EnvClusterCacheListPageBufferSize, int(clusterCacheListPageBufferSize), 1, math.MaxInt32))
	clusterCacheListSemaphoreSize = env.ParseInt64FromEnv(EnvClusterCacheListSemaphore, clusterCacheListSemaphoreSize, 0, math.MaxInt64)
	clusterCacheAttemptLimit = int32(env.ParseNumFromEnv(EnvClusterCacheAttemptLimit, int(clusterCacheAttemptLimit), 1, math.MaxInt32))
	clusterCacheRetryUseBackoff = env.ParseBoolFromEnv(EnvClusterCacheRetryUseBackoff, false)
	clusterCacheBatchEventsProcessing = env.ParseBoolFromEnv(EnvClusterCacheBatchEventsProcessing, true)
	clusterCacheEventsProcessingInterval = env.ParseDurationFromEnv(EnvClusterCacheEventsProcessingInterval, clusterCacheEventsProcessingInterval, 0, math.MaxInt64)
}

// LiveStateCache is the controller's view of the live state of all managed clusters.
type LiveStateCache interface {
	// Returns k8s server version
	GetVersionsInfo(server *appv1.Cluster) (string, []kube.APIResourceInfo, error)
	// Returns true if the given group kind is a namespaced resource
	IsNamespaced(server *appv1.Cluster, gk schema.GroupKind) (bool, error)
	// Returns synced cluster cache
	GetClusterCache(server *appv1.Cluster) (clustercache.ClusterCache, error)
	// Executes the given callback against resources specified by the keys and all its children
	IterateHierarchyV2(server *appv1.Cluster, keys []kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error
	// Returns state of live nodes which correspond for target nodes of specified application.
	GetManagedLiveObjs(destCluster *appv1.Cluster, a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error)
	// IterateResources iterates all resource stored in cache
	IterateResources(server *appv1.Cluster, callback func(res *clustercache.Resource, info *ResourceInfo)) error
	// Returns all top level resources (resources without owner references) of a specified namespace
	GetNamespaceTopLevelResources(server *appv1.Cluster, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error)
	// Starts watching resources of each controlled cluster.
	Run(ctx context.Context) error
	// Returns information about monitored clusters
	GetClustersInfo() []clustercache.ClusterInfo
	// Init must be executed before cache can be used
	Init() error
	// UpdateShard will update the shard of ClusterSharding when the shard has changed.
	UpdateShard(shard int) bool
}

// ObjectUpdatedHandler is invoked whenever a watched resource changes; managedByApp maps
// an app name to whether the changed resource is that app's root resource.
type ObjectUpdatedHandler = func(managedByApp map[string]bool, ref corev1.ObjectReference)

// PodInfo holds pod-specific details extracted into the resource cache.
type PodInfo struct {
	NodeName         string
	ResourceRequests corev1.ResourceList
	Phase            corev1.PodPhase
}

// NodeInfo holds node-specific details extracted into the resource cache.
type NodeInfo struct {
	Name       string
	Capacity   corev1.ResourceList
	SystemInfo corev1.NodeSystemInfo
	Labels     map[string]string
}

// ResourceInfo is the per-resource payload stored in the gitops-engine cluster cache
// (attached via the populate-resource-info handler in getCluster).
type ResourceInfo struct {
	Info    []appv1.InfoItem
	AppName string
	Images  []string
	Health  *health.HealthStatus
	// NetworkingInfo are available only for known types involved into networking: Ingress, Service, Pod
	NetworkingInfo *appv1.ResourceNetworkingInfo
	// PodInfo is available for pods only
	PodInfo *PodInfo
	// NodeInfo is available for nodes only
	NodeInfo *NodeInfo

	// manifestHash is used by skipResourceUpdate to suppress app refreshes when
	// none of the watched resource fields have changed.
	manifestHash string
}

// NewLiveStateCache constructs a LiveStateCache wired to the given informer,
// settings manager, metrics server and sharding cache.
func NewLiveStateCache(
	db db.ArgoDB,
	appInformer cache.SharedIndexInformer,
	settingsMgr *settings.SettingsManager,
	metricsServer *metrics.MetricsServer,
	onObjectUpdated ObjectUpdatedHandler,
	clusterSharding sharding.ClusterShardingCache,
	resourceTracking argo.ResourceTracking,
) LiveStateCache {
	return &liveStateCache{
		appInformer:      appInformer,
		db:               db,
		clusters:         make(map[string]clustercache.ClusterCache),
		onObjectUpdated:  onObjectUpdated,
		settingsMgr:      settingsMgr,
		metricsServer:    metricsServer,
		clusterSharding:  clusterSharding,
		resourceTracking: resourceTracking,
	}
}

// cacheSettings is an immutable snapshot of the Argo CD settings that affect
// cluster cache behavior; a new snapshot is loaded on every settings change.
type cacheSettings struct {
	clusterSettings     clustercache.Settings
	appInstanceLabelKey string
	trackingMethod      appv1.TrackingMethod
	installationID      string
	// resourceOverrides provides a list of ignored differences to ignore watched resource updates
	resourceOverrides map[string]appv1.ResourceOverride

	// ignoreResourceUpdates is a flag to enable resource-ignore rules.
	ignoreResourceUpdatesEnabled bool
}

type liveStateCache struct {
	db                   db.ArgoDB
	appInformer          cache.SharedIndexInformer
	onObjectUpdated      ObjectUpdatedHandler
	settingsMgr          *settings.SettingsManager
	metricsServer        *metrics.MetricsServer
	clusterSharding      sharding.ClusterShardingCache
	resourceTracking     argo.ResourceTracking
	ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts

	// clusters and cacheSettings are guarded by lock.
	clusters      map[string]clustercache.ClusterCache
	cacheSettings cacheSettings
	lock          sync.RWMutex
}

// loadCacheSettings reads the current Argo CD settings relevant to the cluster
// cache and returns them as one snapshot. Returns the first settings error encountered.
func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
	appInstanceLabelKey, err := c.settingsMgr.GetAppInstanceLabelKey()
	if err != nil {
		return nil, err
	}
	trackingMethod, err := c.settingsMgr.GetTrackingMethod()
	if err != nil {
		return nil, err
	}
	installationID, err := c.settingsMgr.GetInstallationID()
	if err != nil {
		return nil, err
	}
	resourceUpdatesOverrides, err := c.settingsMgr.GetIgnoreResourceUpdatesOverrides()
	if err != nil {
		return nil, err
	}
	ignoreResourceUpdatesEnabled, err := c.settingsMgr.GetIsIgnoreResourceUpdatesEnabled()
	if err != nil {
		return nil, err
	}
	resourcesFilter, err := c.settingsMgr.GetResourcesFilter()
	if err != nil {
		return nil, err
	}
	resourceOverrides, err := c.settingsMgr.GetResourceOverrides()
	if err != nil {
		return nil, err
	}
	clusterSettings := clustercache.Settings{
		ResourceHealthOverride: lua.ResourceHealthOverrides(resourceOverrides),
		ResourcesFilter:        resourcesFilter,
	}

	return &cacheSettings{clusterSettings, appInstanceLabelKey, appv1.TrackingMethod(trackingMethod), installationID, resourceUpdatesOverrides, ignoreResourceUpdatesEnabled}, nil
}

// asResourceNode converts a cached cluster resource into the Argo CD ResourceNode
// representation used by the application tree.
func asResourceNode(r *clustercache.Resource) appv1.ResourceNode {
	gv, err := schema.ParseGroupVersion(r.Ref.APIVersion)
	if err != nil {
		// fall back to empty group/version rather than failing the conversion
		gv = schema.GroupVersion{}
	}
	parentRefs := make([]appv1.ResourceRef, len(r.OwnerRefs))
	for i, ownerRef := range r.OwnerRefs {
		ownerGvk := schema.FromAPIVersionAndKind(ownerRef.APIVersion, ownerRef.Kind)
		parentRefs[i] = appv1.ResourceRef{
			Group: ownerGvk.Group,
			Kind:  ownerGvk.Kind,
			// owners are assumed to live in the same namespace as the child
			Version:   ownerGvk.Version,
			Namespace: r.Ref.Namespace,
			Name:      ownerRef.Name,
			UID:       string(ownerRef.UID),
		}
	}
	var resHealth *appv1.HealthStatus
	resourceInfo := resInfo(r)
	if resourceInfo.Health != nil {
		resHealth = &appv1.HealthStatus{Status: resourceInfo.Health.Status, Message: resourceInfo.Health.Message}
	}
	return appv1.ResourceNode{
		ResourceRef: appv1.ResourceRef{
			UID:       string(r.Ref.UID),
			Name:      r.Ref.Name,
			Group:     gv.Group,
			Version:   gv.Version,
			Kind:      r.Ref.Kind,
			Namespace: r.Ref.Namespace,
		},
		ParentRefs:      parentRefs,
		Info:            resourceInfo.Info,
		ResourceVersion: r.ResourceVersion,
		NetworkingInfo:  resourceInfo.NetworkingInfo,
		Images:          resourceInfo.Images,
		Health:          resHealth,
		CreatedAt:       r.CreationTimestamp,
	}
}

// resInfo returns the ResourceInfo attached to a cached resource, or an empty
// ResourceInfo when none was populated (never nil).
func resInfo(r *clustercache.Resource) *ResourceInfo {
	info, ok := r.Info.(*ResourceInfo)
	if !ok || info == nil {
		info = &ResourceInfo{}
	}
	return info
}

// isRootAppNode reports whether the resource is a top-level (no owners) resource
// tracked by an application.
func isRootAppNode(r *clustercache.Resource) bool {
	return resInfo(r).AppName != "" && len(r.OwnerRefs) == 0
}

// getApp resolves the owning application name for a resource by walking up its
// owner references within the namespace.
func getApp(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource) string {
	name, _ := getAppRecursive(r, ns, map[kube.ResourceKey]bool{})
	return name
}

// ownerRefGV parses the group/version of an owner reference, falling back to the
// empty GroupVersion on parse errors.
func ownerRefGV(ownerRef metav1.OwnerReference) schema.GroupVersion {
	gv, err := schema.ParseGroupVersion(ownerRef.APIVersion)
	if err != nil {
		gv = schema.GroupVersion{}
	}
	return gv
}

// getAppRecursive walks owner references looking for a resource with an app name.
// The bool result is false when a circular ownership chain was detected.
func getAppRecursive(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource, visited map[kube.ResourceKey]bool) (string, bool) {
	if visited[r.ResourceKey()] {
		log.Warnf("Circular dependency detected: %v.", visited)
		return resInfo(r).AppName, false
	}
	visited[r.ResourceKey()] = true

	if resInfo(r).AppName != "" {
		return resInfo(r).AppName, true
	}
	for _, ownerRef := range r.OwnerRefs {
		gv := ownerRefGV(ownerRef)
		if parent, ok := ns[kube.NewResourceKey(gv.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name)]; ok {
			// each owner branch gets its own copy of the visited set so that
			// sibling branches do not see each other's paths
			visitedBranch := make(map[kube.ResourceKey]bool, len(visited))
			for k, v := range visited {
				visitedBranch[k] = v
			}
			app, ok := getAppRecursive(parent, ns, visitedBranch)
			if app != "" || !ok {
				return app, ok
			}
		}
	}
	return "", true
}

// ignoredRefreshResources lists "group/kind" pairs whose updates never trigger an
// app refresh (high churn, irrelevant to app state).
var ignoredRefreshResources = map[string]bool{
	"/" + kube.EndpointsKind: true,
}

// skipAppRequeuing checks if the object is an API type which we want to skip requeuing against.
// We ignore API types which have a high churn rate, and/or whose updates are irrelevant to the app
func skipAppRequeuing(key kube.ResourceKey) bool {
	return ignoredRefreshResources[key.Group+"/"+key.Kind]
}

// skipResourceUpdate reports whether a resource update may be ignored: both the
// health status and the (ignore-rules-normalized) manifest hash are unchanged.
func skipResourceUpdate(oldInfo, newInfo *ResourceInfo) bool {
	if oldInfo == nil || newInfo == nil {
		return false
	}
	isSameHealthStatus := (oldInfo.Health == nil && newInfo.Health == nil) || oldInfo.Health != nil && newInfo.Health != nil && oldInfo.Health.Status == newInfo.Health.Status
	// empty hashes mean "not computed" and never compare as equal
	isSameManifest := oldInfo.manifestHash != "" && newInfo.manifestHash != "" && oldInfo.manifestHash == newInfo.manifestHash
	return isSameHealthStatus && isSameManifest
}

// shouldHashManifest validates if the API resource needs to be hashed.
// If there's an app name from resource tracking, or if this is itself an app, we should generate a hash.
// Otherwise, the hashing should be skipped to save CPU time.
387 func shouldHashManifest(appName string, gvk schema.GroupVersionKind, un *unstructured.Unstructured) bool { 388 // Only hash if the resource belongs to an app OR argocd.argoproj.io/ignore-resource-updates is present and set to true 389 // Best - Only hash for resources that are part of an app or their dependencies 390 // (current) - Only hash for resources that are part of an app + all apps that might be from an ApplicationSet 391 // Orphan - If orphan is enabled, hash should be made on all resource of that namespace and a config to disable it 392 // Worst - Hash all resources watched by Argo 393 isTrackedResource := appName != "" || (gvk.Group == application.Group && gvk.Kind == application.ApplicationKind) 394 395 // If the resource is not a tracked resource, we will look up argocd.argoproj.io/ignore-resource-updates and decide 396 // whether we generate hash or not. 397 // If argocd.argoproj.io/ignore-resource-updates is presented and is true, return true 398 // Else return false 399 if !isTrackedResource { 400 if val, ok := un.GetAnnotations()[AnnotationIgnoreResourceUpdates]; ok { 401 applyResourcesUpdate, err := strconv.ParseBool(val) 402 if err != nil { 403 applyResourcesUpdate = false 404 } 405 return applyResourcesUpdate 406 } 407 return false 408 } 409 410 return isTrackedResource 411 } 412 413 // isRetryableError is a helper method to see whether an error 414 // returned from the dynamic client is potentially retryable. 
func isRetryableError(err error) bool {
	if err == nil {
		return false
	}
	return apierrors.IsInternalError(err) ||
		apierrors.IsInvalid(err) ||
		apierrors.IsTooManyRequests(err) ||
		apierrors.IsServerTimeout(err) ||
		apierrors.IsServiceUnavailable(err) ||
		apierrors.IsTimeout(err) ||
		apierrors.IsUnexpectedObjectError(err) ||
		apierrors.IsUnexpectedServerError(err) ||
		isResourceQuotaConflictErr(err) ||
		isTransientNetworkErr(err) ||
		isExceededQuotaErr(err) ||
		isHTTP2GoawayErr(err) ||
		errors.Is(err, syscall.ECONNRESET)
}

// isHTTP2GoawayErr matches the http2 GOAWAY message by substring; there is no
// typed error exposed for it.
func isHTTP2GoawayErr(err error) bool {
	return strings.Contains(err.Error(), "http2: server sent GOAWAY and closed the connection")
}

// isExceededQuotaErr matches a Forbidden response caused by a resource quota limit.
func isExceededQuotaErr(err error) bool {
	return apierrors.IsForbidden(err) && strings.Contains(err.Error(), "exceeded quota")
}

// isResourceQuotaConflictErr matches optimistic-concurrency conflicts on
// ResourceQuota objects, which are safe to retry.
func isResourceQuotaConflictErr(err error) bool {
	return apierrors.IsConflict(err) && strings.Contains(err.Error(), "Operation cannot be fulfilled on resourcequota")
}

// isTransientNetworkErr reports whether err looks like a short-lived network
// failure (DNS, connection reset, timeouts, etc.) worth retrying.
func isTransientNetworkErr(err error) bool {
	var netErr net.Error
	if errors.As(err, &netErr) {
		var dnsErr *net.DNSError
		var opErr *net.OpError
		var unknownNetworkErr net.UnknownNetworkError
		var urlErr *url.Error
		switch {
		case errors.As(err, &dnsErr), errors.As(err, &opErr), errors.As(err, &unknownNetworkErr):
			return true
		case errors.As(err, &urlErr):
			// For a URL error, where it replies "connection closed"
			// retry again.
			return strings.Contains(err.Error(), "Connection closed by foreign host")
		}
	}

	errorString := err.Error()
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		// include stderr of failed subprocesses (e.g. credential plugins) in the match
		errorString = fmt.Sprintf("%s %s", errorString, exitErr.Stderr)
	}
	if strings.Contains(errorString, "net/http: TLS handshake timeout") ||
		strings.Contains(errorString, "i/o timeout") ||
		strings.Contains(errorString, "connection timed out") ||
		strings.Contains(errorString, "connection reset by peer") {
		return true
	}
	return false
}

// getCluster returns the cached clustercache for the given cluster, creating and
// registering a new one (with all event handlers wired) on first use.
// It does not wait for the cache to sync; see getSyncedCluster for that.
func (c *liveStateCache) getCluster(cluster *appv1.Cluster) (clustercache.ClusterCache, error) {
	// fast path: read under the shared lock
	c.lock.RLock()
	clusterCache, ok := c.clusters[cluster.Server]
	cacheSettings := c.cacheSettings
	c.lock.RUnlock()

	if ok {
		return clusterCache, nil
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	// re-check under the write lock: another goroutine may have created the cache
	clusterCache, ok = c.clusters[cluster.Server]
	if ok {
		return clusterCache, nil
	}

	if c.clusterSharding == nil {
		return nil, fmt.Errorf("unable to handle cluster %s: cluster sharding is not configured", cluster.Server)
	}

	if !c.canHandleCluster(cluster) {
		return nil, fmt.Errorf("controller is configured to ignore cluster %s", cluster.Server)
	}

	resourceCustomLabels, err := c.settingsMgr.GetResourceCustomLabels()
	if err != nil {
		return nil, fmt.Errorf("error getting custom label: %w", err)
	}

	respectRBAC, err := c.settingsMgr.RespectRBAC()
	if err != nil {
		return nil, fmt.Errorf("error getting value for %v: %w", settings.RespectRBAC, err)
	}

	clusterCacheConfig, err := cluster.RESTConfig()
	if err != nil {
		return nil, fmt.Errorf("error getting cluster RESTConfig: %w", err)
	}
	// Controller dynamically fetches all resource types available on the cluster
	// using a discovery API that may contain deprecated APIs.
	// This causes log flooding when managing a large number of clusters.
	// https://github.com/argoproj/argo-cd/issues/11973
	// However, we can safely suppress deprecation warnings
	// because we do not rely on resources with a particular API group or version.
	// https://kubernetes.io/blog/2020/09/03/warnings/#customize-client-handling
	//
	// Completely suppress warning logs only for log levels that are less than Debug.
	if log.GetLevel() < log.DebugLevel {
		clusterCacheConfig.WarningHandler = rest.NoWarnings{}
	}

	clusterCacheOpts := []clustercache.UpdateSettingsFunc{
		clustercache.SetListSemaphore(semaphore.NewWeighted(clusterCacheListSemaphoreSize)),
		clustercache.SetListPageSize(clusterCacheListPageSize),
		clustercache.SetListPageBufferSize(clusterCacheListPageBufferSize),
		clustercache.SetWatchResyncTimeout(clusterCacheWatchResyncDuration),
		clustercache.SetClusterSyncRetryTimeout(clusterSyncRetryTimeoutDuration),
		clustercache.SetResyncTimeout(clusterCacheResyncDuration),
		clustercache.SetSettings(cacheSettings.clusterSettings),
		clustercache.SetNamespaces(cluster.Namespaces),
		clustercache.SetClusterResources(cluster.ClusterResources),
		clustercache.SetPopulateResourceInfoHandler(func(un *unstructured.Unstructured, isRoot bool) (any, bool) {
			res := &ResourceInfo{}
			populateNodeInfo(un, res, resourceCustomLabels)
			// re-read settings: this handler runs long after getCluster returned
			c.lock.RLock()
			cacheSettings := c.cacheSettings
			c.lock.RUnlock()

			res.Health, _ = health.GetResourceHealth(un, cacheSettings.clusterSettings.ResourceHealthOverride)

			appName := c.resourceTracking.GetAppName(un, cacheSettings.appInstanceLabelKey, cacheSettings.trackingMethod, cacheSettings.installationID)
			if isRoot && appName != "" {
				res.AppName = appName
			}

			gvk := un.GroupVersionKind()

			if cacheSettings.ignoreResourceUpdatesEnabled && shouldHashManifest(appName, gvk, un) {
				hash, err := generateManifestHash(un, nil, cacheSettings.resourceOverrides, c.ignoreNormalizerOpts)
				if err != nil {
					log.Errorf("Failed to generate manifest hash: %v", err)
				} else {
					res.manifestHash = hash
				}
			}

			// edge case. we do not label CRDs, so they miss the tracking label we inject. But we still
			// want the full resource to be available in our cache (to diff), so we store all CRDs
			return res, res.AppName != "" || gvk.Kind == kube.CustomResourceDefinitionKind
		}),
		clustercache.SetLogr(logutils.NewLogrusLogger(log.WithField("server", cluster.Server))),
		clustercache.SetRetryOptions(clusterCacheAttemptLimit, clusterCacheRetryUseBackoff, isRetryableError),
		clustercache.SetRespectRBAC(respectRBAC),
		clustercache.SetBatchEventsProcessing(clusterCacheBatchEventsProcessing),
		clustercache.SetEventProcessingInterval(clusterCacheEventsProcessingInterval),
	}

	clusterCache = clustercache.NewClusterCache(clusterCacheConfig, clusterCacheOpts...)

	// notify interested applications whenever a watched resource changes
	_ = clusterCache.OnResourceUpdated(func(newRes *clustercache.Resource, oldRes *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) {
		toNotify := make(map[string]bool)
		var ref corev1.ObjectReference
		if newRes != nil {
			ref = newRes.Ref
		} else {
			ref = oldRes.Ref
		}

		c.lock.RLock()
		cacheSettings := c.cacheSettings
		c.lock.RUnlock()

		if cacheSettings.ignoreResourceUpdatesEnabled && oldRes != nil && newRes != nil && skipResourceUpdate(resInfo(oldRes), resInfo(newRes)) {
			// Additional check for debug level so we don't need to evaluate the
			// format string in case of non-debug scenarios
			if log.GetLevel() >= log.DebugLevel {
				namespace := ref.Namespace
				if ref.Namespace == "" {
					namespace = "(cluster-scoped)"
				}
				log.WithFields(log.Fields{
					"server":      cluster.Server,
					"namespace":   namespace,
					"name":        ref.Name,
					"api-version": ref.APIVersion,
					"kind":        ref.Kind,
				}).Debug("Ignoring change of object because none of the watched resource fields have changed")
			}
			return
		}

		for _, r := range []*clustercache.Resource{newRes, oldRes} {
			if r == nil {
				continue
			}
			app := getApp(r, namespaceResources)
			if app == "" || skipAppRequeuing(r.ResourceKey()) {
				continue
			}
			// once an app is marked as needing a root-level refresh, keep it marked
			toNotify[app] = isRootAppNode(r) || toNotify[app]
		}
		c.onObjectUpdated(toNotify, ref)
	})

	// per-cluster event counter metric
	_ = clusterCache.OnEvent(func(_ watch.EventType, un *unstructured.Unstructured) {
		gvk := un.GroupVersionKind()
		c.metricsServer.IncClusterEventsCount(cluster.Server, gvk.Group, gvk.Kind)
	})

	// batch event processing duration metric
	_ = clusterCache.OnProcessEventsHandler(func(duration time.Duration, processedEventsNumber int) {
		c.metricsServer.ObserveResourceEventsProcessingDuration(cluster.Server, duration, processedEventsNumber)
	})

	c.clusters[cluster.Server] = clusterCache

	return clusterCache, nil
}

// getSyncedCluster returns the cluster cache after ensuring it has completed a sync.
func (c *liveStateCache) getSyncedCluster(server *appv1.Cluster) (clustercache.ClusterCache, error) {
	clusterCache, err := c.getCluster(server)
	if err != nil {
		return nil, fmt.Errorf("error getting cluster: %w", err)
	}
	err = clusterCache.EnsureSynced()
	if err != nil {
		return nil, fmt.Errorf("error synchronizing cache state : %w", err)
	}
	return clusterCache, nil
}

// invalidate stores the new cache settings and invalidates every cluster cache so
// the settings take effect on the next sync.
func (c *liveStateCache) invalidate(cacheSettings cacheSettings) {
	log.Info("invalidating live state cache")
	c.lock.Lock()
	c.cacheSettings = cacheSettings
	clusters := c.clusters
	c.lock.Unlock()

	// invalidate outside the lock; Invalidate may be slow
	for _, clust := range clusters {
		clust.Invalidate(clustercache.SetSettings(cacheSettings.clusterSettings))
	}
	log.Info("live state cache invalidated")
}

// IsNamespaced reports whether the given group/kind is namespaced on the cluster.
func (c *liveStateCache) IsNamespaced(server *appv1.Cluster, gk schema.GroupKind) (bool, error) {
	clusterInfo, err := c.getSyncedCluster(server)
	if err != nil {
		return false, err
	}
	return clusterInfo.IsNamespaced(gk)
}

// IterateHierarchyV2 runs action over the resources identified by keys and all of
// their children, supplying each node's owning app name.
func (c *liveStateCache) IterateHierarchyV2(server *appv1.Cluster, keys []kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error {
	clusterInfo, err := c.getSyncedCluster(server)
	if err != nil {
		return err
	}
	clusterInfo.IterateHierarchyV2(keys, func(resource *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) bool {
		return action(asResourceNode(resource), getApp(resource, namespaceResources))
	})
	return nil
}

// IterateResources invokes callback for every cached resource of the cluster.
func (c *liveStateCache) IterateResources(server *appv1.Cluster, callback func(res *clustercache.Resource, info *ResourceInfo)) error {
	clusterInfo, err := c.getSyncedCluster(server)
	if err != nil {
		return err
	}
	// FindResources' predicate always returns false: it is used purely for iteration
	_ = clusterInfo.FindResources("", func(r *clustercache.Resource) bool {
		if info, ok := r.Info.(*ResourceInfo); ok {
			callback(r, info)
		}
		return false
	})
	return nil
}

// GetNamespaceTopLevelResources returns all resources of the namespace that have
// no owner references, converted to ResourceNodes.
func (c *liveStateCache) GetNamespaceTopLevelResources(server *appv1.Cluster, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error) {
	clusterInfo, err := c.getSyncedCluster(server)
	if err != nil {
		return nil, err
	}
	resources := clusterInfo.FindResources(namespace, clustercache.TopLevelResource)
	res := make(map[kube.ResourceKey]appv1.ResourceNode)
	for k, r := range resources {
		res[k] = asResourceNode(r)
	}
	return res, nil
}

// GetManagedLiveObjs returns live objects corresponding to targetObjs that are
// managed by the given application.
func (c *liveStateCache) GetManagedLiveObjs(destCluster *appv1.Cluster, a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error) {
	clusterInfo, err := c.getSyncedCluster(destCluster)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster info for %q: %w", destCluster.Server, err)
	}
	return clusterInfo.GetManagedLiveObjs(targetObjs, func(r *clustercache.Resource) bool {
		return resInfo(r).AppName == a.InstanceName(c.settingsMgr.GetNamespace())
	})
}

// GetVersionsInfo returns the cluster's server version and the list of API resources.
func (c *liveStateCache) GetVersionsInfo(server *appv1.Cluster) (string, []kube.APIResourceInfo, error) {
	clusterInfo, err := c.getSyncedCluster(server)
	if err != nil {
		return "", nil, fmt.Errorf("failed to get cluster info for %q: %w", server.Server, err)
	}
	return clusterInfo.GetServerVersion(), clusterInfo.GetAPIResources(), nil
}

// isClusterHasApps reports whether any of the given informer objects is an
// Application whose destination resolves to the given cluster.
func (c *liveStateCache) isClusterHasApps(apps []any, cluster *appv1.Cluster) bool {
	for _, obj := range apps {
		app, ok := obj.(*appv1.Application)
		if !ok {
			continue
		}
		destCluster, err := argo.GetDestinationCluster(context.Background(), app.Spec.Destination, c.db)
		if err != nil {
			log.Warnf("Failed to get destination cluster: %v", err)
			continue
		}
		if destCluster.Server == cluster.Server {
			return true
		}
	}
	return false
}

// watchSettings subscribes to Argo CD settings updates and invalidates the cache
// whenever the cache-relevant settings actually change; it runs until ctx is done.
func (c *liveStateCache) watchSettings(ctx context.Context) {
	updateCh := make(chan *settings.ArgoCDSettings, 1)
	c.settingsMgr.Subscribe(updateCh)

	done := false
	for !done {
		select {
		case <-updateCh:
			nextCacheSettings, err := c.loadCacheSettings()
			if err != nil {
				log.Warnf("Failed to read updated settings: %v", err)
				continue
			}

			c.lock.Lock()
			needInvalidate := false
			if !reflect.DeepEqual(c.cacheSettings, *nextCacheSettings) {
				c.cacheSettings = *nextCacheSettings
				needInvalidate = true
			}
			c.lock.Unlock()
			if needInvalidate {
				c.invalidate(*nextCacheSettings)
			}
		case <-ctx.Done():
			done = true
		}
	}
	log.Info("shutting down settings watch")
	c.settingsMgr.Unsubscribe(updateCh)
	close(updateCh)
}

// Init must be executed before cache can be used; it loads the initial settings snapshot.
func (c *liveStateCache) Init() error {
	cacheSettings, err := c.loadCacheSettings()
	if err != nil {
		return fmt.Errorf("error loading cache settings: %w", err)
	}
	c.cacheSettings = *cacheSettings
	return nil
}

// Run
// Run watches for resource changes annotated with application label on all registered clusters and schedule corresponding app refresh.
func (c *liveStateCache) Run(ctx context.Context) error {
	go c.watchSettings(ctx)

	kube.RetryUntilSucceed(ctx, clustercache.ClusterRetryTimeout, "watch clusters", logutils.NewLogrusLogger(logutils.NewWithCurrentConfig()), func() error {
		return c.db.WatchClusters(ctx, c.handleAddEvent, c.handleModEvent, c.handleDeleteEvent)
	})

	<-ctx.Done()
	c.invalidate(c.cacheSettings)
	return nil
}

// canHandleCluster reports whether this controller shard manages the cluster.
func (c *liveStateCache) canHandleCluster(cluster *appv1.Cluster) bool {
	return c.clusterSharding.IsManagedCluster(cluster)
}

// handleAddEvent registers a newly discovered cluster with the sharding cache and,
// when the cluster already has apps targeting it, warms up its cache in the background.
func (c *liveStateCache) handleAddEvent(cluster *appv1.Cluster) {
	c.clusterSharding.Add(cluster)
	if !c.canHandleCluster(cluster) {
		log.Infof("Ignoring cluster %s", cluster.Server)
		return
	}
	c.lock.Lock()
	_, ok := c.clusters[cluster.Server]
	c.lock.Unlock()
	if !ok {
		log.Debugf("Checking if cache %v / cluster %v has appInformer %v", c, cluster, c.appInformer)
		if c.appInformer == nil {
			log.Warn("Cannot get a cluster appInformer. Cache may not be started this time")
			return
		}
		if c.isClusterHasApps(c.appInformer.GetStore().List(), cluster) {
			go func() {
				// warm up cache for cluster with apps
				_, _ = c.getSyncedCluster(cluster)
			}()
		}
	}
}

// handleModEvent reacts to a cluster spec change: drops the cache when the cluster
// is no longer managed by this shard, otherwise applies the changed settings and
// invalidates/resyncs the cache when needed.
func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *appv1.Cluster) {
	c.clusterSharding.Update(oldCluster, newCluster)
	c.lock.Lock()
	cluster, ok := c.clusters[newCluster.Server]
	c.lock.Unlock()
	if ok {
		if !c.canHandleCluster(newCluster) {
			cluster.Invalidate()
			c.lock.Lock()
			delete(c.clusters, newCluster.Server)
			c.lock.Unlock()
			return
		}

		// collect only the settings that actually changed
		var updateSettings []clustercache.UpdateSettingsFunc
		if !reflect.DeepEqual(oldCluster.Config, newCluster.Config) {
			newClusterRESTConfig, err := newCluster.RESTConfig()
			if err == nil {
				updateSettings = append(updateSettings, clustercache.SetConfig(newClusterRESTConfig))
			} else {
				log.Errorf("error getting cluster REST config: %v", err)
			}
		}
		if !reflect.DeepEqual(oldCluster.Namespaces, newCluster.Namespaces) {
			updateSettings = append(updateSettings, clustercache.SetNamespaces(newCluster.Namespaces))
		}
		if !reflect.DeepEqual(oldCluster.ClusterResources, newCluster.ClusterResources) {
			updateSettings = append(updateSettings, clustercache.SetClusterResources(newCluster.ClusterResources))
		}
		// an explicit refresh request newer than the last sync forces invalidation
		forceInvalidate := false
		if newCluster.RefreshRequestedAt != nil &&
			cluster.GetClusterInfo().LastCacheSyncTime != nil &&
			cluster.GetClusterInfo().LastCacheSyncTime.Before(newCluster.RefreshRequestedAt.Time) {
			forceInvalidate = true
		}

		if len(updateSettings) > 0 || forceInvalidate {
			cluster.Invalidate(updateSettings...)
			go func() {
				// warm up cluster cache
				_ = cluster.EnsureSynced()
			}()
		}
	}
}

// handleDeleteEvent removes the cluster from sharding and drops (after
// invalidating) its cache entry if one exists.
func (c *liveStateCache) handleDeleteEvent(clusterServer string) {
	c.lock.RLock()
	c.clusterSharding.Delete(clusterServer)
	cluster, ok := c.clusters[clusterServer]
	c.lock.RUnlock()
	if ok {
		cluster.Invalidate()
		c.lock.Lock()
		delete(c.clusters, clusterServer)
		c.lock.Unlock()
	}
}

// GetClustersInfo returns a snapshot of ClusterInfo for every monitored cluster.
func (c *liveStateCache) GetClustersInfo() []clustercache.ClusterInfo {
	// copy the map under the lock, then query each cache outside it
	clusters := make(map[string]clustercache.ClusterCache)
	c.lock.RLock()
	for k := range c.clusters {
		clusters[k] = c.clusters[k]
	}
	c.lock.RUnlock()

	res := make([]clustercache.ClusterInfo, 0)
	// note: loop variable c shadows the receiver here
	for server, c := range clusters {
		info := c.GetClusterInfo()
		info.Server = server
		res = append(res, info)
	}
	return res
}

// GetClusterCache returns the synced cluster cache for the given cluster.
func (c *liveStateCache) GetClusterCache(server *appv1.Cluster) (clustercache.ClusterCache, error) {
	return c.getSyncedCluster(server)
}

// UpdateShard will update the shard of ClusterSharding when the shard has changed.
func (c *liveStateCache) UpdateShard(shard int) bool {
	return c.clusterSharding.UpdateShard(shard)
}