github.com/argoproj/argo-cd/v3@v3.2.1/controller/cache/cache.go (about)

     1  package cache
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  	"net"
     9  	"net/url"
    10  	"os/exec"
    11  	"reflect"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"syscall"
    16  	"time"
    17  
    18  	clustercache "github.com/argoproj/gitops-engine/pkg/cache"
    19  	"github.com/argoproj/gitops-engine/pkg/health"
    20  	"github.com/argoproj/gitops-engine/pkg/utils/kube"
    21  	log "github.com/sirupsen/logrus"
    22  	"golang.org/x/sync/semaphore"
    23  	corev1 "k8s.io/api/core/v1"
    24  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    27  	"k8s.io/apimachinery/pkg/runtime/schema"
    28  	"k8s.io/apimachinery/pkg/watch"
    29  	"k8s.io/client-go/rest"
    30  	"k8s.io/client-go/tools/cache"
    31  
    32  	"github.com/argoproj/argo-cd/v3/controller/metrics"
    33  	"github.com/argoproj/argo-cd/v3/controller/sharding"
    34  	"github.com/argoproj/argo-cd/v3/pkg/apis/application"
    35  	appv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
    36  	"github.com/argoproj/argo-cd/v3/util/argo"
    37  	"github.com/argoproj/argo-cd/v3/util/argo/normalizers"
    38  	"github.com/argoproj/argo-cd/v3/util/db"
    39  	"github.com/argoproj/argo-cd/v3/util/env"
    40  	logutils "github.com/argoproj/argo-cd/v3/util/log"
    41  	"github.com/argoproj/argo-cd/v3/util/lua"
    42  	"github.com/argoproj/argo-cd/v3/util/settings"
    43  )
    44  
// Environment variable names used to tune the GitOps engine cluster cache.
// init() below reads them and overrides the package-level defaults.
const (
	// EnvClusterCacheResyncDuration is the env variable that holds cluster cache re-sync duration
	EnvClusterCacheResyncDuration = "ARGOCD_CLUSTER_CACHE_RESYNC_DURATION"

	// EnvClusterCacheWatchResyncDuration is the env variable that holds cluster cache watch re-sync duration
	EnvClusterCacheWatchResyncDuration = "ARGOCD_CLUSTER_CACHE_WATCH_RESYNC_DURATION"

	// EnvClusterSyncRetryTimeoutDuration is the env variable that holds cluster retry duration when sync error happens
	EnvClusterSyncRetryTimeoutDuration = "ARGOCD_CLUSTER_SYNC_RETRY_TIMEOUT_DURATION"

	// EnvClusterCacheListPageSize is the env variable to control size of the list page size when making K8s queries
	EnvClusterCacheListPageSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_SIZE"

	// EnvClusterCacheListPageBufferSize is the env variable to control the number of pages to buffer when making a K8s query to list resources
	EnvClusterCacheListPageBufferSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_BUFFER_SIZE"

	// EnvClusterCacheListSemaphore is the env variable to control size of the list semaphore
	// This is used to limit the number of concurrent memory consuming operations on the
	// k8s list queries results across all clusters to avoid memory spikes during cache initialization.
	EnvClusterCacheListSemaphore = "ARGOCD_CLUSTER_CACHE_LIST_SEMAPHORE"

	// EnvClusterCacheAttemptLimit is the env variable to control the retry limit for listing resources during cluster cache sync
	EnvClusterCacheAttemptLimit = "ARGOCD_CLUSTER_CACHE_ATTEMPT_LIMIT"

	// EnvClusterCacheRetryUseBackoff is the env variable to control whether to use a backoff strategy with the retry during cluster cache sync
	EnvClusterCacheRetryUseBackoff = "ARGOCD_CLUSTER_CACHE_RETRY_USE_BACKOFF"

	// EnvClusterCacheBatchEventsProcessing is the env variable to control whether to enable batch events processing
	EnvClusterCacheBatchEventsProcessing = "ARGOCD_CLUSTER_CACHE_BATCH_EVENTS_PROCESSING"

	// EnvClusterCacheEventsProcessingInterval is the env variable to control the interval between processing events when BatchEventsProcessing is enabled
	EnvClusterCacheEventsProcessingInterval = "ARGOCD_CLUSTER_CACHE_EVENTS_PROCESSING_INTERVAL"

	// AnnotationIgnoreResourceUpdates when set to true on an untracked resource,
	// argo will apply `ignoreResourceUpdates` configuration on it.
	AnnotationIgnoreResourceUpdates = "argocd.argoproj.io/ignore-resource-updates"
)
    82  
// GitOps engine cluster cache tuning options
var (
	// clusterCacheResyncDuration controls the duration of cluster cache refresh.
	// NOTE: this differs from gitops-engine default of 24h
	clusterCacheResyncDuration = 12 * time.Hour

	// clusterCacheWatchResyncDuration controls the maximum duration that group/kind watches are allowed to run
	// for before relisting & restarting the watch
	clusterCacheWatchResyncDuration = 10 * time.Minute

	// clusterSyncRetryTimeoutDuration controls the sync retry duration when cluster sync error happens
	clusterSyncRetryTimeoutDuration = 10 * time.Second

	// The default limit of 50 is chosen based on experiments.
	clusterCacheListSemaphoreSize int64 = 50

	// clusterCacheListPageSize is the page size when performing K8s list requests.
	// 500 is equal to kubectl's size
	clusterCacheListPageSize int64 = 500

	// clusterCacheListPageBufferSize is the number of pages to buffer when performing K8s list requests
	clusterCacheListPageBufferSize int32 = 1

	// clusterCacheRetryLimit sets a retry limit for failed requests during cluster cache sync
	// If set to 1, retries are disabled.
	clusterCacheAttemptLimit int32 = 1

	// clusterCacheRetryUseBackoff specifies whether to use a backoff strategy on cluster cache sync, if retry is enabled
	clusterCacheRetryUseBackoff = false

	// clusterCacheBatchEventsProcessing specifies whether to enable batch events processing
	// NOTE: init() passes `true` (not this value) as the default to
	// ParseBoolFromEnv, so the effective default is enabled unless the env
	// variable overrides it.
	clusterCacheBatchEventsProcessing = false

	// clusterCacheEventsProcessingInterval specifies the interval between processing events when BatchEventsProcessing is enabled
	clusterCacheEventsProcessingInterval = 100 * time.Millisecond
)
   119  
   120  func init() {
   121  	clusterCacheResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheResyncDuration, clusterCacheResyncDuration, 0, math.MaxInt64)
   122  	clusterCacheWatchResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheWatchResyncDuration, clusterCacheWatchResyncDuration, 0, math.MaxInt64)
   123  	clusterSyncRetryTimeoutDuration = env.ParseDurationFromEnv(EnvClusterSyncRetryTimeoutDuration, clusterSyncRetryTimeoutDuration, 0, math.MaxInt64)
   124  	clusterCacheListPageSize = env.ParseInt64FromEnv(EnvClusterCacheListPageSize, clusterCacheListPageSize, 0, math.MaxInt64)
   125  	clusterCacheListPageBufferSize = int32(env.ParseNumFromEnv(EnvClusterCacheListPageBufferSize, int(clusterCacheListPageBufferSize), 1, math.MaxInt32))
   126  	clusterCacheListSemaphoreSize = env.ParseInt64FromEnv(EnvClusterCacheListSemaphore, clusterCacheListSemaphoreSize, 0, math.MaxInt64)
   127  	clusterCacheAttemptLimit = int32(env.ParseNumFromEnv(EnvClusterCacheAttemptLimit, int(clusterCacheAttemptLimit), 1, math.MaxInt32))
   128  	clusterCacheRetryUseBackoff = env.ParseBoolFromEnv(EnvClusterCacheRetryUseBackoff, false)
   129  	clusterCacheBatchEventsProcessing = env.ParseBoolFromEnv(EnvClusterCacheBatchEventsProcessing, true)
   130  	clusterCacheEventsProcessingInterval = env.ParseDurationFromEnv(EnvClusterCacheEventsProcessingInterval, clusterCacheEventsProcessingInterval, 0, math.MaxInt64)
   131  }
   132  
// LiveStateCache provides access to the cached live state of managed clusters
// and keeps the per-cluster caches in sync with the cluster API servers.
type LiveStateCache interface {
	// GetVersionsInfo returns the k8s server version and supported API resources
	GetVersionsInfo(server *appv1.Cluster) (string, []kube.APIResourceInfo, error)
	// IsNamespaced returns true if the given group kind is a namespaced resource
	IsNamespaced(server *appv1.Cluster, gk schema.GroupKind) (bool, error)
	// GetClusterCache returns the synced cluster cache
	GetClusterCache(server *appv1.Cluster) (clustercache.ClusterCache, error)
	// IterateHierarchyV2 executes the given callback against the resources specified by the keys and all their children
	IterateHierarchyV2(server *appv1.Cluster, keys []kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error
	// GetManagedLiveObjs returns the state of live nodes which correspond to the target nodes of the specified application.
	GetManagedLiveObjs(destCluster *appv1.Cluster, a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error)
	// IterateResources iterates over all resources stored in the cache
	IterateResources(server *appv1.Cluster, callback func(res *clustercache.Resource, info *ResourceInfo)) error
	// GetNamespaceTopLevelResources returns all top level resources (resources without owner references) of the specified namespace
	GetNamespaceTopLevelResources(server *appv1.Cluster, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error)
	// Run starts watching resources of each controlled cluster.
	Run(ctx context.Context) error
	// GetClustersInfo returns information about monitored clusters
	GetClustersInfo() []clustercache.ClusterInfo
	// Init must be executed before the cache can be used
	Init() error
	// UpdateShard will update the shard of ClusterSharding when the shard has changed.
	UpdateShard(shard int) bool
}
   157  
// ObjectUpdatedHandler is invoked when a watched cluster resource changes.
// managedByApp maps an application name to whether the application's root
// resource itself was updated; ref identifies the changed object.
type ObjectUpdatedHandler = func(managedByApp map[string]bool, ref corev1.ObjectReference)

// PodInfo holds pod-specific details extracted from a Pod resource.
type PodInfo struct {
	NodeName         string
	ResourceRequests corev1.ResourceList
	Phase            corev1.PodPhase
}

// NodeInfo holds node-specific details extracted from a Node resource.
type NodeInfo struct {
	Name       string
	Capacity   corev1.ResourceList
	SystemInfo corev1.NodeSystemInfo
	Labels     map[string]string
}

// ResourceInfo is the Argo CD-specific metadata stored alongside every cached
// cluster resource (see the populate-resource-info handler in getCluster).
type ResourceInfo struct {
	Info    []appv1.InfoItem
	AppName string
	Images  []string
	Health  *health.HealthStatus
	// NetworkingInfo are available only for known types involved into networking: Ingress, Service, Pod
	NetworkingInfo *appv1.ResourceNetworkingInfo
	// PodInfo is available for pods only
	PodInfo *PodInfo
	// NodeInfo is available for nodes only
	NodeInfo *NodeInfo

	// manifestHash is a hash of the normalized manifest, populated only when
	// ignore-resource-updates is enabled; used by skipResourceUpdate to detect
	// meaningful manifest changes.
	manifestHash string
}
   187  
   188  func NewLiveStateCache(
   189  	db db.ArgoDB,
   190  	appInformer cache.SharedIndexInformer,
   191  	settingsMgr *settings.SettingsManager,
   192  	metricsServer *metrics.MetricsServer,
   193  	onObjectUpdated ObjectUpdatedHandler,
   194  	clusterSharding sharding.ClusterShardingCache,
   195  	resourceTracking argo.ResourceTracking,
   196  ) LiveStateCache {
   197  	return &liveStateCache{
   198  		appInformer:      appInformer,
   199  		db:               db,
   200  		clusters:         make(map[string]clustercache.ClusterCache),
   201  		onObjectUpdated:  onObjectUpdated,
   202  		settingsMgr:      settingsMgr,
   203  		metricsServer:    metricsServer,
   204  		clusterSharding:  clusterSharding,
   205  		resourceTracking: resourceTracking,
   206  	}
   207  }
   208  
// cacheSettings is a snapshot of the settings that influence cluster cache
// behavior; a fresh snapshot is produced by loadCacheSettings and swapped in
// atomically under liveStateCache.lock.
type cacheSettings struct {
	clusterSettings     clustercache.Settings
	appInstanceLabelKey string
	trackingMethod      appv1.TrackingMethod
	installationID      string
	// resourceOverrides provides a list of ignored differences to ignore watched resource updates
	resourceOverrides map[string]appv1.ResourceOverride

	// ignoreResourceUpdates is a flag to enable resource-ignore rules.
	ignoreResourceUpdatesEnabled bool
}
   220  
// liveStateCache is the default LiveStateCache implementation. It maintains
// one clustercache.ClusterCache per cluster server URL, created lazily in
// getCluster.
type liveStateCache struct {
	db                   db.ArgoDB
	appInformer          cache.SharedIndexInformer
	onObjectUpdated      ObjectUpdatedHandler
	settingsMgr          *settings.SettingsManager
	metricsServer        *metrics.MetricsServer
	clusterSharding      sharding.ClusterShardingCache
	resourceTracking     argo.ResourceTracking
	ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts

	// clusters and cacheSettings are guarded by lock.
	clusters      map[string]clustercache.ClusterCache
	cacheSettings cacheSettings
	lock          sync.RWMutex
}
   235  
   236  func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
   237  	appInstanceLabelKey, err := c.settingsMgr.GetAppInstanceLabelKey()
   238  	if err != nil {
   239  		return nil, err
   240  	}
   241  	trackingMethod, err := c.settingsMgr.GetTrackingMethod()
   242  	if err != nil {
   243  		return nil, err
   244  	}
   245  	installationID, err := c.settingsMgr.GetInstallationID()
   246  	if err != nil {
   247  		return nil, err
   248  	}
   249  	resourceUpdatesOverrides, err := c.settingsMgr.GetIgnoreResourceUpdatesOverrides()
   250  	if err != nil {
   251  		return nil, err
   252  	}
   253  	ignoreResourceUpdatesEnabled, err := c.settingsMgr.GetIsIgnoreResourceUpdatesEnabled()
   254  	if err != nil {
   255  		return nil, err
   256  	}
   257  	resourcesFilter, err := c.settingsMgr.GetResourcesFilter()
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  	resourceOverrides, err := c.settingsMgr.GetResourceOverrides()
   262  	if err != nil {
   263  		return nil, err
   264  	}
   265  	clusterSettings := clustercache.Settings{
   266  		ResourceHealthOverride: lua.ResourceHealthOverrides(resourceOverrides),
   267  		ResourcesFilter:        resourcesFilter,
   268  	}
   269  
   270  	return &cacheSettings{clusterSettings, appInstanceLabelKey, appv1.TrackingMethod(trackingMethod), installationID, resourceUpdatesOverrides, ignoreResourceUpdatesEnabled}, nil
   271  }
   272  
   273  func asResourceNode(r *clustercache.Resource) appv1.ResourceNode {
   274  	gv, err := schema.ParseGroupVersion(r.Ref.APIVersion)
   275  	if err != nil {
   276  		gv = schema.GroupVersion{}
   277  	}
   278  	parentRefs := make([]appv1.ResourceRef, len(r.OwnerRefs))
   279  	for i, ownerRef := range r.OwnerRefs {
   280  		ownerGvk := schema.FromAPIVersionAndKind(ownerRef.APIVersion, ownerRef.Kind)
   281  		parentRefs[i] = appv1.ResourceRef{
   282  			Group:     ownerGvk.Group,
   283  			Kind:      ownerGvk.Kind,
   284  			Version:   ownerGvk.Version,
   285  			Namespace: r.Ref.Namespace,
   286  			Name:      ownerRef.Name,
   287  			UID:       string(ownerRef.UID),
   288  		}
   289  	}
   290  	var resHealth *appv1.HealthStatus
   291  	resourceInfo := resInfo(r)
   292  	if resourceInfo.Health != nil {
   293  		resHealth = &appv1.HealthStatus{Status: resourceInfo.Health.Status, Message: resourceInfo.Health.Message}
   294  	}
   295  	return appv1.ResourceNode{
   296  		ResourceRef: appv1.ResourceRef{
   297  			UID:       string(r.Ref.UID),
   298  			Name:      r.Ref.Name,
   299  			Group:     gv.Group,
   300  			Version:   gv.Version,
   301  			Kind:      r.Ref.Kind,
   302  			Namespace: r.Ref.Namespace,
   303  		},
   304  		ParentRefs:      parentRefs,
   305  		Info:            resourceInfo.Info,
   306  		ResourceVersion: r.ResourceVersion,
   307  		NetworkingInfo:  resourceInfo.NetworkingInfo,
   308  		Images:          resourceInfo.Images,
   309  		Health:          resHealth,
   310  		CreatedAt:       r.CreationTimestamp,
   311  	}
   312  }
   313  
   314  func resInfo(r *clustercache.Resource) *ResourceInfo {
   315  	info, ok := r.Info.(*ResourceInfo)
   316  	if !ok || info == nil {
   317  		info = &ResourceInfo{}
   318  	}
   319  	return info
   320  }
   321  
   322  func isRootAppNode(r *clustercache.Resource) bool {
   323  	return resInfo(r).AppName != "" && len(r.OwnerRefs) == 0
   324  }
   325  
   326  func getApp(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource) string {
   327  	name, _ := getAppRecursive(r, ns, map[kube.ResourceKey]bool{})
   328  	return name
   329  }
   330  
   331  func ownerRefGV(ownerRef metav1.OwnerReference) schema.GroupVersion {
   332  	gv, err := schema.ParseGroupVersion(ownerRef.APIVersion)
   333  	if err != nil {
   334  		gv = schema.GroupVersion{}
   335  	}
   336  	return gv
   337  }
   338  
// getAppRecursive resolves the owning application name of a resource by
// walking up its owner references within the namespace. The boolean result is
// false when a circular ownership chain was detected, in which case the
// returned name is unreliable.
func getAppRecursive(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource, visited map[kube.ResourceKey]bool) (string, bool) {
	if visited[r.ResourceKey()] {
		log.Warnf("Circular dependency detected: %v.", visited)
		return resInfo(r).AppName, false
	}
	visited[r.ResourceKey()] = true

	if resInfo(r).AppName != "" {
		return resInfo(r).AppName, true
	}
	for _, ownerRef := range r.OwnerRefs {
		gv := ownerRefGV(ownerRef)
		if parent, ok := ns[kube.NewResourceKey(gv.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name)]; ok {
			// Copy the visited set per owner branch so that sibling branches
			// do not see each other's visited nodes.
			visitedBranch := make(map[kube.ResourceKey]bool, len(visited))
			for k, v := range visited {
				visitedBranch[k] = v
			}
			app, ok := getAppRecursive(parent, ns, visitedBranch)
			// Stop on the first branch that found an app name or hit a cycle.
			if app != "" || !ok {
				return app, ok
			}
		}
	}
	return "", true
}
   364  
// ignoredRefreshResources lists "group/kind" identifiers whose updates never
// trigger an application refresh (see skipAppRequeuing). Endpoints churn on
// every pod change and is irrelevant to app state.
var ignoredRefreshResources = map[string]bool{
	"/" + kube.EndpointsKind: true,
}
   368  
   369  // skipAppRequeuing checks if the object is an API type which we want to skip requeuing against.
   370  // We ignore API types which have a high churn rate, and/or whose updates are irrelevant to the app
   371  func skipAppRequeuing(key kube.ResourceKey) bool {
   372  	return ignoredRefreshResources[key.Group+"/"+key.Kind]
   373  }
   374  
   375  func skipResourceUpdate(oldInfo, newInfo *ResourceInfo) bool {
   376  	if oldInfo == nil || newInfo == nil {
   377  		return false
   378  	}
   379  	isSameHealthStatus := (oldInfo.Health == nil && newInfo.Health == nil) || oldInfo.Health != nil && newInfo.Health != nil && oldInfo.Health.Status == newInfo.Health.Status
   380  	isSameManifest := oldInfo.manifestHash != "" && newInfo.manifestHash != "" && oldInfo.manifestHash == newInfo.manifestHash
   381  	return isSameHealthStatus && isSameManifest
   382  }
   383  
   384  // shouldHashManifest validates if the API resource needs to be hashed.
   385  // If there's an app name from resource tracking, or if this is itself an app, we should generate a hash.
   386  // Otherwise, the hashing should be skipped to save CPU time.
   387  func shouldHashManifest(appName string, gvk schema.GroupVersionKind, un *unstructured.Unstructured) bool {
   388  	// Only hash if the resource belongs to an app OR argocd.argoproj.io/ignore-resource-updates is present and set to true
   389  	// Best      - Only hash for resources that are part of an app or their dependencies
   390  	// (current) - Only hash for resources that are part of an app + all apps that might be from an ApplicationSet
   391  	// Orphan    - If orphan is enabled, hash should be made on all resource of that namespace and a config to disable it
   392  	// Worst     - Hash all resources watched by Argo
   393  	isTrackedResource := appName != "" || (gvk.Group == application.Group && gvk.Kind == application.ApplicationKind)
   394  
   395  	// If the resource is not a tracked resource, we will look up argocd.argoproj.io/ignore-resource-updates and decide
   396  	// whether we generate hash or not.
   397  	// If argocd.argoproj.io/ignore-resource-updates is presented and is true, return true
   398  	// Else return false
   399  	if !isTrackedResource {
   400  		if val, ok := un.GetAnnotations()[AnnotationIgnoreResourceUpdates]; ok {
   401  			applyResourcesUpdate, err := strconv.ParseBool(val)
   402  			if err != nil {
   403  				applyResourcesUpdate = false
   404  			}
   405  			return applyResourcesUpdate
   406  		}
   407  		return false
   408  	}
   409  
   410  	return isTrackedResource
   411  }
   412  
   413  // isRetryableError is a helper method to see whether an error
   414  // returned from the dynamic client is potentially retryable.
   415  func isRetryableError(err error) bool {
   416  	if err == nil {
   417  		return false
   418  	}
   419  	return apierrors.IsInternalError(err) ||
   420  		apierrors.IsInvalid(err) ||
   421  		apierrors.IsTooManyRequests(err) ||
   422  		apierrors.IsServerTimeout(err) ||
   423  		apierrors.IsServiceUnavailable(err) ||
   424  		apierrors.IsTimeout(err) ||
   425  		apierrors.IsUnexpectedObjectError(err) ||
   426  		apierrors.IsUnexpectedServerError(err) ||
   427  		isResourceQuotaConflictErr(err) ||
   428  		isTransientNetworkErr(err) ||
   429  		isExceededQuotaErr(err) ||
   430  		isHTTP2GoawayErr(err) ||
   431  		errors.Is(err, syscall.ECONNRESET)
   432  }
   433  
   434  func isHTTP2GoawayErr(err error) bool {
   435  	return strings.Contains(err.Error(), "http2: server sent GOAWAY and closed the connection")
   436  }
   437  
   438  func isExceededQuotaErr(err error) bool {
   439  	return apierrors.IsForbidden(err) && strings.Contains(err.Error(), "exceeded quota")
   440  }
   441  
   442  func isResourceQuotaConflictErr(err error) bool {
   443  	return apierrors.IsConflict(err) && strings.Contains(err.Error(), "Operation cannot be fulfilled on resourcequota")
   444  }
   445  
   446  func isTransientNetworkErr(err error) bool {
   447  	var netErr net.Error
   448  	if errors.As(err, &netErr) {
   449  		var dnsErr *net.DNSError
   450  		var opErr *net.OpError
   451  		var unknownNetworkErr net.UnknownNetworkError
   452  		var urlErr *url.Error
   453  		switch {
   454  		case errors.As(err, &dnsErr), errors.As(err, &opErr), errors.As(err, &unknownNetworkErr):
   455  			return true
   456  		case errors.As(err, &urlErr):
   457  			// For a URL error, where it replies "connection closed"
   458  			// retry again.
   459  			return strings.Contains(err.Error(), "Connection closed by foreign host")
   460  		}
   461  	}
   462  
   463  	errorString := err.Error()
   464  	var exitErr *exec.ExitError
   465  	if errors.As(err, &exitErr) {
   466  		errorString = fmt.Sprintf("%s %s", errorString, exitErr.Stderr)
   467  	}
   468  	if strings.Contains(errorString, "net/http: TLS handshake timeout") ||
   469  		strings.Contains(errorString, "i/o timeout") ||
   470  		strings.Contains(errorString, "connection timed out") ||
   471  		strings.Contains(errorString, "connection reset by peer") {
   472  		return true
   473  	}
   474  	return false
   475  }
   476  
// getCluster returns the cluster cache for the given cluster, creating and
// registering a new one on first use. The fast path takes only a read lock;
// creation happens under the write lock with a re-check.
func (c *liveStateCache) getCluster(cluster *appv1.Cluster) (clustercache.ClusterCache, error) {
	c.lock.RLock()
	clusterCache, ok := c.clusters[cluster.Server]
	cacheSettings := c.cacheSettings
	c.lock.RUnlock()

	if ok {
		return clusterCache, nil
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	// Re-check under the write lock: another goroutine may have created the
	// cache between releasing the read lock and acquiring the write lock.
	clusterCache, ok = c.clusters[cluster.Server]
	if ok {
		return clusterCache, nil
	}

	if c.clusterSharding == nil {
		return nil, fmt.Errorf("unable to handle cluster %s: cluster sharding is not configured", cluster.Server)
	}

	// Only build a cache for clusters assigned to this controller shard.
	if !c.canHandleCluster(cluster) {
		return nil, fmt.Errorf("controller is configured to ignore cluster %s", cluster.Server)
	}

	resourceCustomLabels, err := c.settingsMgr.GetResourceCustomLabels()
	if err != nil {
		return nil, fmt.Errorf("error getting custom label: %w", err)
	}

	respectRBAC, err := c.settingsMgr.RespectRBAC()
	if err != nil {
		return nil, fmt.Errorf("error getting value for %v: %w", settings.RespectRBAC, err)
	}

	clusterCacheConfig, err := cluster.RESTConfig()
	if err != nil {
		return nil, fmt.Errorf("error getting cluster RESTConfig: %w", err)
	}
	// Controller dynamically fetches all resource types available on the cluster
	// using a discovery API that may contain deprecated APIs.
	// This causes log flooding when managing a large number of clusters.
	// https://github.com/argoproj/argo-cd/issues/11973
	// However, we can safely suppress deprecation warnings
	// because we do not rely on resources with a particular API group or version.
	// https://kubernetes.io/blog/2020/09/03/warnings/#customize-client-handling
	//
	// Completely suppress warning logs only for log levels that are less than Debug.
	if log.GetLevel() < log.DebugLevel {
		clusterCacheConfig.WarningHandler = rest.NoWarnings{}
	}

	clusterCacheOpts := []clustercache.UpdateSettingsFunc{
		clustercache.SetListSemaphore(semaphore.NewWeighted(clusterCacheListSemaphoreSize)),
		clustercache.SetListPageSize(clusterCacheListPageSize),
		clustercache.SetListPageBufferSize(clusterCacheListPageBufferSize),
		clustercache.SetWatchResyncTimeout(clusterCacheWatchResyncDuration),
		clustercache.SetClusterSyncRetryTimeout(clusterSyncRetryTimeoutDuration),
		clustercache.SetResyncTimeout(clusterCacheResyncDuration),
		clustercache.SetSettings(cacheSettings.clusterSettings),
		clustercache.SetNamespaces(cluster.Namespaces),
		clustercache.SetClusterResources(cluster.ClusterResources),
		// Computes the ResourceInfo stored alongside each cached resource and
		// decides whether the full manifest must be kept in the cache.
		clustercache.SetPopulateResourceInfoHandler(func(un *unstructured.Unstructured, isRoot bool) (any, bool) {
			res := &ResourceInfo{}
			populateNodeInfo(un, res, resourceCustomLabels)
			// Re-read the settings snapshot: it may have been invalidated
			// since this cluster cache was created.
			c.lock.RLock()
			cacheSettings := c.cacheSettings
			c.lock.RUnlock()

			res.Health, _ = health.GetResourceHealth(un, cacheSettings.clusterSettings.ResourceHealthOverride)

			appName := c.resourceTracking.GetAppName(un, cacheSettings.appInstanceLabelKey, cacheSettings.trackingMethod, cacheSettings.installationID)
			if isRoot && appName != "" {
				res.AppName = appName
			}

			gvk := un.GroupVersionKind()

			if cacheSettings.ignoreResourceUpdatesEnabled && shouldHashManifest(appName, gvk, un) {
				hash, err := generateManifestHash(un, nil, cacheSettings.resourceOverrides, c.ignoreNormalizerOpts)
				if err != nil {
					log.Errorf("Failed to generate manifest hash: %v", err)
				} else {
					res.manifestHash = hash
				}
			}

			// edge case. we do not label CRDs, so they miss the tracking label we inject. But we still
			// want the full resource to be available in our cache (to diff), so we store all CRDs
			return res, res.AppName != "" || gvk.Kind == kube.CustomResourceDefinitionKind
		}),
		clustercache.SetLogr(logutils.NewLogrusLogger(log.WithField("server", cluster.Server))),
		clustercache.SetRetryOptions(clusterCacheAttemptLimit, clusterCacheRetryUseBackoff, isRetryableError),
		clustercache.SetRespectRBAC(respectRBAC),
		clustercache.SetBatchEventsProcessing(clusterCacheBatchEventsProcessing),
		clustercache.SetEventProcessingInterval(clusterCacheEventsProcessingInterval),
	}

	clusterCache = clustercache.NewClusterCache(clusterCacheConfig, clusterCacheOpts...)

	// Notify the application controller about resource updates so that the
	// affected applications can be requeued for refresh.
	_ = clusterCache.OnResourceUpdated(func(newRes *clustercache.Resource, oldRes *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) {
		toNotify := make(map[string]bool)
		var ref corev1.ObjectReference
		if newRes != nil {
			ref = newRes.Ref
		} else {
			ref = oldRes.Ref
		}

		c.lock.RLock()
		cacheSettings := c.cacheSettings
		c.lock.RUnlock()

		if cacheSettings.ignoreResourceUpdatesEnabled && oldRes != nil && newRes != nil && skipResourceUpdate(resInfo(oldRes), resInfo(newRes)) {
			// Additional check for debug level so we don't need to evaluate the
			// format string in case of non-debug scenarios
			if log.GetLevel() >= log.DebugLevel {
				namespace := ref.Namespace
				if ref.Namespace == "" {
					namespace = "(cluster-scoped)"
				}
				log.WithFields(log.Fields{
					"server":      cluster.Server,
					"namespace":   namespace,
					"name":        ref.Name,
					"api-version": ref.APIVersion,
					"kind":        ref.Kind,
				}).Debug("Ignoring change of object because none of the watched resource fields have changed")
			}
			return
		}

		for _, r := range []*clustercache.Resource{newRes, oldRes} {
			if r == nil {
				continue
			}
			app := getApp(r, namespaceResources)
			if app == "" || skipAppRequeuing(r.ResourceKey()) {
				continue
			}
			// The value records whether the app's root resource itself changed.
			toNotify[app] = isRootAppNode(r) || toNotify[app]
		}
		c.onObjectUpdated(toNotify, ref)
	})

	// Count cluster events for metrics.
	_ = clusterCache.OnEvent(func(_ watch.EventType, un *unstructured.Unstructured) {
		gvk := un.GroupVersionKind()
		c.metricsServer.IncClusterEventsCount(cluster.Server, gvk.Group, gvk.Kind)
	})

	// Record how long event batches take to process.
	_ = clusterCache.OnProcessEventsHandler(func(duration time.Duration, processedEventsNumber int) {
		c.metricsServer.ObserveResourceEventsProcessingDuration(cluster.Server, duration, processedEventsNumber)
	})

	c.clusters[cluster.Server] = clusterCache

	return clusterCache, nil
}
   636  
   637  func (c *liveStateCache) getSyncedCluster(server *appv1.Cluster) (clustercache.ClusterCache, error) {
   638  	clusterCache, err := c.getCluster(server)
   639  	if err != nil {
   640  		return nil, fmt.Errorf("error getting cluster: %w", err)
   641  	}
   642  	err = clusterCache.EnsureSynced()
   643  	if err != nil {
   644  		return nil, fmt.Errorf("error synchronizing cache state : %w", err)
   645  	}
   646  	return clusterCache, nil
   647  }
   648  
   649  func (c *liveStateCache) invalidate(cacheSettings cacheSettings) {
   650  	log.Info("invalidating live state cache")
   651  	c.lock.Lock()
   652  	c.cacheSettings = cacheSettings
   653  	clusters := c.clusters
   654  	c.lock.Unlock()
   655  
   656  	for _, clust := range clusters {
   657  		clust.Invalidate(clustercache.SetSettings(cacheSettings.clusterSettings))
   658  	}
   659  	log.Info("live state cache invalidated")
   660  }
   661  
   662  func (c *liveStateCache) IsNamespaced(server *appv1.Cluster, gk schema.GroupKind) (bool, error) {
   663  	clusterInfo, err := c.getSyncedCluster(server)
   664  	if err != nil {
   665  		return false, err
   666  	}
   667  	return clusterInfo.IsNamespaced(gk)
   668  }
   669  
   670  func (c *liveStateCache) IterateHierarchyV2(server *appv1.Cluster, keys []kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error {
   671  	clusterInfo, err := c.getSyncedCluster(server)
   672  	if err != nil {
   673  		return err
   674  	}
   675  	clusterInfo.IterateHierarchyV2(keys, func(resource *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) bool {
   676  		return action(asResourceNode(resource), getApp(resource, namespaceResources))
   677  	})
   678  	return nil
   679  }
   680  
   681  func (c *liveStateCache) IterateResources(server *appv1.Cluster, callback func(res *clustercache.Resource, info *ResourceInfo)) error {
   682  	clusterInfo, err := c.getSyncedCluster(server)
   683  	if err != nil {
   684  		return err
   685  	}
   686  	_ = clusterInfo.FindResources("", func(r *clustercache.Resource) bool {
   687  		if info, ok := r.Info.(*ResourceInfo); ok {
   688  			callback(r, info)
   689  		}
   690  		return false
   691  	})
   692  	return nil
   693  }
   694  
   695  func (c *liveStateCache) GetNamespaceTopLevelResources(server *appv1.Cluster, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error) {
   696  	clusterInfo, err := c.getSyncedCluster(server)
   697  	if err != nil {
   698  		return nil, err
   699  	}
   700  	resources := clusterInfo.FindResources(namespace, clustercache.TopLevelResource)
   701  	res := make(map[kube.ResourceKey]appv1.ResourceNode)
   702  	for k, r := range resources {
   703  		res[k] = asResourceNode(r)
   704  	}
   705  	return res, nil
   706  }
   707  
   708  func (c *liveStateCache) GetManagedLiveObjs(destCluster *appv1.Cluster, a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error) {
   709  	clusterInfo, err := c.getSyncedCluster(destCluster)
   710  	if err != nil {
   711  		return nil, fmt.Errorf("failed to get cluster info for %q: %w", destCluster.Server, err)
   712  	}
   713  	return clusterInfo.GetManagedLiveObjs(targetObjs, func(r *clustercache.Resource) bool {
   714  		return resInfo(r).AppName == a.InstanceName(c.settingsMgr.GetNamespace())
   715  	})
   716  }
   717  
   718  func (c *liveStateCache) GetVersionsInfo(server *appv1.Cluster) (string, []kube.APIResourceInfo, error) {
   719  	clusterInfo, err := c.getSyncedCluster(server)
   720  	if err != nil {
   721  		return "", nil, fmt.Errorf("failed to get cluster info for %q: %w", server.Server, err)
   722  	}
   723  	return clusterInfo.GetServerVersion(), clusterInfo.GetAPIResources(), nil
   724  }
   725  
   726  func (c *liveStateCache) isClusterHasApps(apps []any, cluster *appv1.Cluster) bool {
   727  	for _, obj := range apps {
   728  		app, ok := obj.(*appv1.Application)
   729  		if !ok {
   730  			continue
   731  		}
   732  		destCluster, err := argo.GetDestinationCluster(context.Background(), app.Spec.Destination, c.db)
   733  		if err != nil {
   734  			log.Warnf("Failed to get destination cluster: %v", err)
   735  			continue
   736  		}
   737  		if destCluster.Server == cluster.Server {
   738  			return true
   739  		}
   740  	}
   741  	return false
   742  }
   743  
   744  func (c *liveStateCache) watchSettings(ctx context.Context) {
   745  	updateCh := make(chan *settings.ArgoCDSettings, 1)
   746  	c.settingsMgr.Subscribe(updateCh)
   747  
   748  	done := false
   749  	for !done {
   750  		select {
   751  		case <-updateCh:
   752  			nextCacheSettings, err := c.loadCacheSettings()
   753  			if err != nil {
   754  				log.Warnf("Failed to read updated settings: %v", err)
   755  				continue
   756  			}
   757  
   758  			c.lock.Lock()
   759  			needInvalidate := false
   760  			if !reflect.DeepEqual(c.cacheSettings, *nextCacheSettings) {
   761  				c.cacheSettings = *nextCacheSettings
   762  				needInvalidate = true
   763  			}
   764  			c.lock.Unlock()
   765  			if needInvalidate {
   766  				c.invalidate(*nextCacheSettings)
   767  			}
   768  		case <-ctx.Done():
   769  			done = true
   770  		}
   771  	}
   772  	log.Info("shutting down settings watch")
   773  	c.settingsMgr.Unsubscribe(updateCh)
   774  	close(updateCh)
   775  }
   776  
   777  func (c *liveStateCache) Init() error {
   778  	cacheSettings, err := c.loadCacheSettings()
   779  	if err != nil {
   780  		return fmt.Errorf("error loading cache settings: %w", err)
   781  	}
   782  	c.cacheSettings = *cacheSettings
   783  	return nil
   784  }
   785  
   786  // Run watches for resource changes annotated with application label on all registered clusters and schedule corresponding app refresh.
   787  func (c *liveStateCache) Run(ctx context.Context) error {
   788  	go c.watchSettings(ctx)
   789  
   790  	kube.RetryUntilSucceed(ctx, clustercache.ClusterRetryTimeout, "watch clusters", logutils.NewLogrusLogger(logutils.NewWithCurrentConfig()), func() error {
   791  		return c.db.WatchClusters(ctx, c.handleAddEvent, c.handleModEvent, c.handleDeleteEvent)
   792  	})
   793  
   794  	<-ctx.Done()
   795  	c.invalidate(c.cacheSettings)
   796  	return nil
   797  }
   798  
// canHandleCluster reports whether this controller shard is responsible for
// managing the given cluster, per the configured sharding algorithm.
func (c *liveStateCache) canHandleCluster(cluster *appv1.Cluster) bool {
	return c.clusterSharding.IsManagedCluster(cluster)
}
   802  
   803  func (c *liveStateCache) handleAddEvent(cluster *appv1.Cluster) {
   804  	c.clusterSharding.Add(cluster)
   805  	if !c.canHandleCluster(cluster) {
   806  		log.Infof("Ignoring cluster %s", cluster.Server)
   807  		return
   808  	}
   809  	c.lock.Lock()
   810  	_, ok := c.clusters[cluster.Server]
   811  	c.lock.Unlock()
   812  	if !ok {
   813  		log.Debugf("Checking if cache %v / cluster %v has appInformer %v", c, cluster, c.appInformer)
   814  		if c.appInformer == nil {
   815  			log.Warn("Cannot get a cluster appInformer. Cache may not be started this time")
   816  			return
   817  		}
   818  		if c.isClusterHasApps(c.appInformer.GetStore().List(), cluster) {
   819  			go func() {
   820  				// warm up cache for cluster with apps
   821  				_, _ = c.getSyncedCluster(cluster)
   822  			}()
   823  		}
   824  	}
   825  }
   826  
   827  func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *appv1.Cluster) {
   828  	c.clusterSharding.Update(oldCluster, newCluster)
   829  	c.lock.Lock()
   830  	cluster, ok := c.clusters[newCluster.Server]
   831  	c.lock.Unlock()
   832  	if ok {
   833  		if !c.canHandleCluster(newCluster) {
   834  			cluster.Invalidate()
   835  			c.lock.Lock()
   836  			delete(c.clusters, newCluster.Server)
   837  			c.lock.Unlock()
   838  			return
   839  		}
   840  
   841  		var updateSettings []clustercache.UpdateSettingsFunc
   842  		if !reflect.DeepEqual(oldCluster.Config, newCluster.Config) {
   843  			newClusterRESTConfig, err := newCluster.RESTConfig()
   844  			if err == nil {
   845  				updateSettings = append(updateSettings, clustercache.SetConfig(newClusterRESTConfig))
   846  			} else {
   847  				log.Errorf("error getting cluster REST config: %v", err)
   848  			}
   849  		}
   850  		if !reflect.DeepEqual(oldCluster.Namespaces, newCluster.Namespaces) {
   851  			updateSettings = append(updateSettings, clustercache.SetNamespaces(newCluster.Namespaces))
   852  		}
   853  		if !reflect.DeepEqual(oldCluster.ClusterResources, newCluster.ClusterResources) {
   854  			updateSettings = append(updateSettings, clustercache.SetClusterResources(newCluster.ClusterResources))
   855  		}
   856  		forceInvalidate := false
   857  		if newCluster.RefreshRequestedAt != nil &&
   858  			cluster.GetClusterInfo().LastCacheSyncTime != nil &&
   859  			cluster.GetClusterInfo().LastCacheSyncTime.Before(newCluster.RefreshRequestedAt.Time) {
   860  			forceInvalidate = true
   861  		}
   862  
   863  		if len(updateSettings) > 0 || forceInvalidate {
   864  			cluster.Invalidate(updateSettings...)
   865  			go func() {
   866  				// warm up cluster cache
   867  				_ = cluster.EnsureSynced()
   868  			}()
   869  		}
   870  	}
   871  }
   872  
   873  func (c *liveStateCache) handleDeleteEvent(clusterServer string) {
   874  	c.lock.RLock()
   875  	c.clusterSharding.Delete(clusterServer)
   876  	cluster, ok := c.clusters[clusterServer]
   877  	c.lock.RUnlock()
   878  	if ok {
   879  		cluster.Invalidate()
   880  		c.lock.Lock()
   881  		delete(c.clusters, clusterServer)
   882  		c.lock.Unlock()
   883  	}
   884  }
   885  
   886  func (c *liveStateCache) GetClustersInfo() []clustercache.ClusterInfo {
   887  	clusters := make(map[string]clustercache.ClusterCache)
   888  	c.lock.RLock()
   889  	for k := range c.clusters {
   890  		clusters[k] = c.clusters[k]
   891  	}
   892  	c.lock.RUnlock()
   893  
   894  	res := make([]clustercache.ClusterInfo, 0)
   895  	for server, c := range clusters {
   896  		info := c.GetClusterInfo()
   897  		info.Server = server
   898  		res = append(res, info)
   899  	}
   900  	return res
   901  }
   902  
// GetClusterCache returns the cache for the given cluster, ensuring it has
// completed an initial sync.
func (c *liveStateCache) GetClusterCache(server *appv1.Cluster) (clustercache.ClusterCache, error) {
	return c.getSyncedCluster(server)
}
   906  
// UpdateShard will update the shard of ClusterSharding when the shard has changed.
// It returns true if the shard value was actually changed.
func (c *liveStateCache) UpdateShard(shard int) bool {
	return c.clusterSharding.UpdateShard(shard)
}