github.com/argoproj/argo-cd/v2@v2.10.9/controller/cache/cache.go (about)

     1  package cache
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  	"net"
     9  	"net/url"
    10  	"os/exec"
    11  	"reflect"
    12  	"strings"
    13  	"sync"
    14  	"syscall"
    15  	"time"
    16  
    17  	clustercache "github.com/argoproj/gitops-engine/pkg/cache"
    18  	"github.com/argoproj/gitops-engine/pkg/health"
    19  	"github.com/argoproj/gitops-engine/pkg/utils/kube"
    20  	log "github.com/sirupsen/logrus"
    21  	"golang.org/x/sync/semaphore"
    22  	v1 "k8s.io/api/core/v1"
    23  	kerrors "k8s.io/apimachinery/pkg/api/errors"
    24  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    25  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    26  	"k8s.io/apimachinery/pkg/runtime/schema"
    27  	"k8s.io/apimachinery/pkg/watch"
    28  	"k8s.io/client-go/rest"
    29  	"k8s.io/client-go/tools/cache"
    30  
    31  	"github.com/argoproj/argo-cd/v2/controller/metrics"
    32  	"github.com/argoproj/argo-cd/v2/controller/sharding"
    33  	"github.com/argoproj/argo-cd/v2/pkg/apis/application"
    34  	appv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
    35  	"github.com/argoproj/argo-cd/v2/util/argo"
    36  	"github.com/argoproj/argo-cd/v2/util/argo/normalizers"
    37  	"github.com/argoproj/argo-cd/v2/util/db"
    38  	"github.com/argoproj/argo-cd/v2/util/env"
    39  	logutils "github.com/argoproj/argo-cd/v2/util/log"
    40  	"github.com/argoproj/argo-cd/v2/util/lua"
    41  	"github.com/argoproj/argo-cd/v2/util/settings"
    42  )
    43  
// Environment variable names used to tune the GitOps engine cluster cache.
// They are read exactly once, in init() below, and override the package-level
// defaults declared in the var block that follows.
const (
	// EnvClusterCacheResyncDuration is the env variable that holds cluster cache re-sync duration
	EnvClusterCacheResyncDuration = "ARGOCD_CLUSTER_CACHE_RESYNC_DURATION"

	// EnvClusterCacheWatchResyncDuration is the env variable that holds cluster cache watch re-sync duration
	EnvClusterCacheWatchResyncDuration = "ARGOCD_CLUSTER_CACHE_WATCH_RESYNC_DURATION"

	// EnvClusterSyncRetryTimeoutDuration is the env variable that holds cluster retry duration when sync error happens
	EnvClusterSyncRetryTimeoutDuration = "ARGOCD_CLUSTER_SYNC_RETRY_TIMEOUT_DURATION"

	// EnvClusterCacheListPageSize is the env variable to control size of the list page size when making K8s queries
	EnvClusterCacheListPageSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_SIZE"

	// EnvClusterCacheListPageBufferSize is the env variable to control the number of pages to buffer when making a K8s query to list resources
	EnvClusterCacheListPageBufferSize = "ARGOCD_CLUSTER_CACHE_LIST_PAGE_BUFFER_SIZE"

	// EnvClusterCacheListSemaphore is the env variable to control size of the list semaphore
	// This is used to limit the number of concurrent memory consuming operations on the
	// k8s list queries results across all clusters to avoid memory spikes during cache initialization.
	EnvClusterCacheListSemaphore = "ARGOCD_CLUSTER_CACHE_LIST_SEMAPHORE"

	// EnvClusterCacheAttemptLimit is the env variable to control the retry limit for listing resources during cluster cache sync
	EnvClusterCacheAttemptLimit = "ARGOCD_CLUSTER_CACHE_ATTEMPT_LIMIT"

	// EnvClusterCacheRetryUseBackoff is the env variable to control whether to use a backoff strategy with the retry during cluster cache sync
	EnvClusterCacheRetryUseBackoff = "ARGOCD_CLUSTER_CACHE_RETRY_USE_BACKOFF"
)
    71  
// GitOps engine cluster cache tuning options.
// These defaults may be overridden via the ARGOCD_CLUSTER_CACHE_* environment
// variables parsed in init().
var (
	// clusterCacheResyncDuration controls the duration of cluster cache refresh.
	// NOTE: this differs from gitops-engine default of 24h
	clusterCacheResyncDuration = 12 * time.Hour

	// clusterCacheWatchResyncDuration controls the maximum duration that group/kind watches are allowed to run
	// for before relisting & restarting the watch
	clusterCacheWatchResyncDuration = 10 * time.Minute

	// clusterSyncRetryTimeoutDuration controls the sync retry duration when cluster sync error happens
	clusterSyncRetryTimeoutDuration = 10 * time.Second

	// clusterCacheListSemaphoreSize limits the number of concurrent K8s list
	// operations across all clusters. The default limit of 50 is chosen based on experiments.
	clusterCacheListSemaphoreSize int64 = 50

	// clusterCacheListPageSize is the page size when performing K8s list requests.
	// 500 is equal to kubectl's size
	clusterCacheListPageSize int64 = 500

	// clusterCacheListPageBufferSize is the number of pages to buffer when performing K8s list requests
	clusterCacheListPageBufferSize int32 = 1

	// clusterCacheAttemptLimit sets a retry limit for failed requests during cluster cache sync
	// If set to 1, retries are disabled.
	clusterCacheAttemptLimit int32 = 1

	// clusterCacheRetryUseBackoff specifies whether to use a backoff strategy on cluster cache sync, if retry is enabled
	clusterCacheRetryUseBackoff bool = false
)
   102  
// init applies the ARGOCD_CLUSTER_CACHE_* environment overrides to the
// package-level tuning knobs declared above. Each Parse* helper falls back to
// the compiled-in default when the variable is unset or outside its bounds.
func init() {
	clusterCacheResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheResyncDuration, clusterCacheResyncDuration, 0, math.MaxInt64)
	clusterCacheWatchResyncDuration = env.ParseDurationFromEnv(EnvClusterCacheWatchResyncDuration, clusterCacheWatchResyncDuration, 0, math.MaxInt64)
	clusterSyncRetryTimeoutDuration = env.ParseDurationFromEnv(EnvClusterSyncRetryTimeoutDuration, clusterSyncRetryTimeoutDuration, 0, math.MaxInt64)
	clusterCacheListPageSize = env.ParseInt64FromEnv(EnvClusterCacheListPageSize, clusterCacheListPageSize, 0, math.MaxInt64)
	clusterCacheListPageBufferSize = int32(env.ParseNumFromEnv(EnvClusterCacheListPageBufferSize, int(clusterCacheListPageBufferSize), 1, math.MaxInt32))
	clusterCacheListSemaphoreSize = env.ParseInt64FromEnv(EnvClusterCacheListSemaphore, clusterCacheListSemaphoreSize, 0, math.MaxInt64)
	clusterCacheAttemptLimit = int32(env.ParseNumFromEnv(EnvClusterCacheAttemptLimit, int(clusterCacheAttemptLimit), 1, math.MaxInt32))
	clusterCacheRetryUseBackoff = env.ParseBoolFromEnv(EnvClusterCacheRetryUseBackoff, false)
}
   113  
// LiveStateCache is the controller's view of live cluster state: a set of
// per-cluster resource caches plus lookups that join cached resources with
// the applications that own them.
type LiveStateCache interface {
	// Returns k8s server version
	GetVersionsInfo(serverURL string) (string, []kube.APIResourceInfo, error)
	// Returns true if the given group kind is a namespaced resource
	IsNamespaced(server string, gk schema.GroupKind) (bool, error)
	// Returns synced cluster cache
	GetClusterCache(server string) (clustercache.ClusterCache, error)
	// Executes the given callback against the resource specified by the key and all its children
	IterateHierarchy(server string, key kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error
	// Returns state of live nodes which correspond to target nodes of specified application.
	GetManagedLiveObjs(a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error)
	// IterateResources iterates all resources stored in cache
	IterateResources(server string, callback func(res *clustercache.Resource, info *ResourceInfo)) error
	// Returns all top level resources (resources without owner references) of a specified namespace
	GetNamespaceTopLevelResources(server string, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error)
	// Starts watching resources of each controlled cluster.
	Run(ctx context.Context) error
	// Returns information about monitored clusters
	GetClustersInfo() []clustercache.ClusterInfo
	// Init must be executed before cache can be used
	Init() error
}
   136  
// ObjectUpdatedHandler is invoked when a watched resource changes. managedByApp
// maps each affected application name to true when the changed resource is that
// app's root (top-level) resource; ref identifies the changed object.
type ObjectUpdatedHandler = func(managedByApp map[string]bool, ref v1.ObjectReference)
   138  
// PodInfo holds pod-specific details cached alongside a Pod resource
// (see ResourceInfo.PodInfo).
type PodInfo struct {
	NodeName         string
	ResourceRequests v1.ResourceList
	Phase            v1.PodPhase
}
   144  
// NodeInfo holds node-specific details cached alongside a Node resource
// (see ResourceInfo.NodeInfo).
type NodeInfo struct {
	Name       string
	Capacity   v1.ResourceList
	SystemInfo v1.NodeSystemInfo
}
   150  
// ResourceInfo is the per-resource payload attached to every entry of the
// gitops-engine cluster cache (via SetPopulateResourceInfoHandler in
// getCluster).
type ResourceInfo struct {
	Info    []appv1.InfoItem
	AppName string
	Images  []string
	Health  *health.HealthStatus
	// NetworkingInfo are available only for known types involved into networking: Ingress, Service, Pod
	NetworkingInfo *appv1.ResourceNetworkingInfo
	// PodInfo is available for pods only
	PodInfo *PodInfo
	// NodeInfo is available for nodes only
	NodeInfo *NodeInfo

	// manifestHash is only populated when the ignore-resource-updates feature
	// is enabled; skipResourceUpdate compares it between watch events to
	// detect no-op updates.
	manifestHash string
}
   165  
   166  func NewLiveStateCache(
   167  	db db.ArgoDB,
   168  	appInformer cache.SharedIndexInformer,
   169  	settingsMgr *settings.SettingsManager,
   170  	kubectl kube.Kubectl,
   171  	metricsServer *metrics.MetricsServer,
   172  	onObjectUpdated ObjectUpdatedHandler,
   173  	clusterSharding sharding.ClusterShardingCache,
   174  	resourceTracking argo.ResourceTracking) LiveStateCache {
   175  
   176  	return &liveStateCache{
   177  		appInformer:      appInformer,
   178  		db:               db,
   179  		clusters:         make(map[string]clustercache.ClusterCache),
   180  		onObjectUpdated:  onObjectUpdated,
   181  		kubectl:          kubectl,
   182  		settingsMgr:      settingsMgr,
   183  		metricsServer:    metricsServer,
   184  		clusterSharding:  clusterSharding,
   185  		resourceTracking: resourceTracking,
   186  	}
   187  }
   188  
// cacheSettings is a snapshot of the controller settings that influence
// cluster cache behavior. Snapshots are replaced wholesale when settings
// change (see watchSettings), never mutated in place.
type cacheSettings struct {
	clusterSettings     clustercache.Settings
	appInstanceLabelKey string
	trackingMethod      appv1.TrackingMethod
	// resourceOverrides provides a list of ignored differences to ignore watched resource updates
	resourceOverrides map[string]appv1.ResourceOverride

	// ignoreResourceUpdates is a flag to enable resource-ignore rules.
	ignoreResourceUpdatesEnabled bool
}
   199  
// liveStateCache is the production implementation of LiveStateCache. It keeps
// one gitops-engine ClusterCache per destination cluster, created lazily in
// getCluster.
type liveStateCache struct {
	db                   db.ArgoDB
	appInformer          cache.SharedIndexInformer
	onObjectUpdated      ObjectUpdatedHandler
	kubectl              kube.Kubectl
	settingsMgr          *settings.SettingsManager
	metricsServer        *metrics.MetricsServer
	clusterSharding      sharding.ClusterShardingCache
	resourceTracking     argo.ResourceTracking
	ignoreNormalizerOpts normalizers.IgnoreNormalizerOpts

	// clusters and cacheSettings are guarded by lock.
	clusters      map[string]clustercache.ClusterCache
	cacheSettings cacheSettings
	lock          sync.RWMutex
}
   215  
   216  func (c *liveStateCache) loadCacheSettings() (*cacheSettings, error) {
   217  	appInstanceLabelKey, err := c.settingsMgr.GetAppInstanceLabelKey()
   218  	if err != nil {
   219  		return nil, err
   220  	}
   221  	resourceUpdatesOverrides, err := c.settingsMgr.GetIgnoreResourceUpdatesOverrides()
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  	ignoreResourceUpdatesEnabled, err := c.settingsMgr.GetIsIgnoreResourceUpdatesEnabled()
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  	resourcesFilter, err := c.settingsMgr.GetResourcesFilter()
   230  	if err != nil {
   231  		return nil, err
   232  	}
   233  	resourceOverrides, err := c.settingsMgr.GetResourceOverrides()
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  	clusterSettings := clustercache.Settings{
   238  		ResourceHealthOverride: lua.ResourceHealthOverrides(resourceOverrides),
   239  		ResourcesFilter:        resourcesFilter,
   240  	}
   241  
   242  	return &cacheSettings{clusterSettings, appInstanceLabelKey, argo.GetTrackingMethod(c.settingsMgr), resourceUpdatesOverrides, ignoreResourceUpdatesEnabled}, nil
   243  }
   244  
   245  func asResourceNode(r *clustercache.Resource) appv1.ResourceNode {
   246  	gv, err := schema.ParseGroupVersion(r.Ref.APIVersion)
   247  	if err != nil {
   248  		gv = schema.GroupVersion{}
   249  	}
   250  	parentRefs := make([]appv1.ResourceRef, len(r.OwnerRefs))
   251  	for i, ownerRef := range r.OwnerRefs {
   252  		ownerGvk := schema.FromAPIVersionAndKind(ownerRef.APIVersion, ownerRef.Kind)
   253  		ownerKey := kube.NewResourceKey(ownerGvk.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name)
   254  		parentRefs[i] = appv1.ResourceRef{Name: ownerRef.Name, Kind: ownerKey.Kind, Namespace: r.Ref.Namespace, Group: ownerKey.Group, UID: string(ownerRef.UID)}
   255  	}
   256  	var resHealth *appv1.HealthStatus
   257  	resourceInfo := resInfo(r)
   258  	if resourceInfo.Health != nil {
   259  		resHealth = &appv1.HealthStatus{Status: resourceInfo.Health.Status, Message: resourceInfo.Health.Message}
   260  	}
   261  	return appv1.ResourceNode{
   262  		ResourceRef: appv1.ResourceRef{
   263  			UID:       string(r.Ref.UID),
   264  			Name:      r.Ref.Name,
   265  			Group:     gv.Group,
   266  			Version:   gv.Version,
   267  			Kind:      r.Ref.Kind,
   268  			Namespace: r.Ref.Namespace,
   269  		},
   270  		ParentRefs:      parentRefs,
   271  		Info:            resourceInfo.Info,
   272  		ResourceVersion: r.ResourceVersion,
   273  		NetworkingInfo:  resourceInfo.NetworkingInfo,
   274  		Images:          resourceInfo.Images,
   275  		Health:          resHealth,
   276  		CreatedAt:       r.CreationTimestamp,
   277  	}
   278  }
   279  
   280  func resInfo(r *clustercache.Resource) *ResourceInfo {
   281  	info, ok := r.Info.(*ResourceInfo)
   282  	if !ok || info == nil {
   283  		info = &ResourceInfo{}
   284  	}
   285  	return info
   286  }
   287  
   288  func isRootAppNode(r *clustercache.Resource) bool {
   289  	return resInfo(r).AppName != "" && len(r.OwnerRefs) == 0
   290  }
   291  
   292  func getApp(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource) string {
   293  	return getAppRecursive(r, ns, map[kube.ResourceKey]bool{})
   294  }
   295  
   296  func ownerRefGV(ownerRef metav1.OwnerReference) schema.GroupVersion {
   297  	gv, err := schema.ParseGroupVersion(ownerRef.APIVersion)
   298  	if err != nil {
   299  		gv = schema.GroupVersion{}
   300  	}
   301  	return gv
   302  }
   303  
   304  func getAppRecursive(r *clustercache.Resource, ns map[kube.ResourceKey]*clustercache.Resource, visited map[kube.ResourceKey]bool) string {
   305  	if !visited[r.ResourceKey()] {
   306  		visited[r.ResourceKey()] = true
   307  	} else {
   308  		log.Warnf("Circular dependency detected: %v.", visited)
   309  		return resInfo(r).AppName
   310  	}
   311  
   312  	if resInfo(r).AppName != "" {
   313  		return resInfo(r).AppName
   314  	}
   315  	for _, ownerRef := range r.OwnerRefs {
   316  		gv := ownerRefGV(ownerRef)
   317  		if parent, ok := ns[kube.NewResourceKey(gv.Group, ownerRef.Kind, r.Ref.Namespace, ownerRef.Name)]; ok {
   318  			app := getAppRecursive(parent, ns, visited)
   319  			if app != "" {
   320  				return app
   321  			}
   322  		}
   323  	}
   324  	return ""
   325  }
   326  
var (
	// ignoredRefreshResources maps "group/kind" strings whose updates never
	// trigger an app refresh; see skipAppRequeuing. Endpoints are excluded
	// because of their high churn rate.
	ignoredRefreshResources = map[string]bool{
		"/" + kube.EndpointsKind: true,
	}
)
   332  
   333  // skipAppRequeuing checks if the object is an API type which we want to skip requeuing against.
   334  // We ignore API types which have a high churn rate, and/or whose updates are irrelevant to the app
   335  func skipAppRequeuing(key kube.ResourceKey) bool {
   336  	return ignoredRefreshResources[key.Group+"/"+key.Kind]
   337  }
   338  
   339  func skipResourceUpdate(oldInfo, newInfo *ResourceInfo) bool {
   340  	if oldInfo == nil || newInfo == nil {
   341  		return false
   342  	}
   343  	isSameHealthStatus := (oldInfo.Health == nil && newInfo.Health == nil) || oldInfo.Health != nil && newInfo.Health != nil && oldInfo.Health.Status == newInfo.Health.Status
   344  	isSameManifest := oldInfo.manifestHash != "" && newInfo.manifestHash != "" && oldInfo.manifestHash == newInfo.manifestHash
   345  	return isSameHealthStatus && isSameManifest
   346  }
   347  
   348  // shouldHashManifest validates if the API resource needs to be hashed.
   349  // If there's an app name from resource tracking, or if this is itself an app, we should generate a hash.
   350  // Otherwise, the hashing should be skipped to save CPU time.
   351  func shouldHashManifest(appName string, gvk schema.GroupVersionKind) bool {
   352  	// Only hash if the resource belongs to an app.
   353  	// Best      - Only hash for resources that are part of an app or their dependencies
   354  	// (current) - Only hash for resources that are part of an app + all apps that might be from an ApplicationSet
   355  	// Orphan    - If orphan is enabled, hash should be made on all resource of that namespace and a config to disable it
   356  	// Worst     - Hash all resources watched by Argo
   357  	return appName != "" || (gvk.Group == application.Group && gvk.Kind == application.ApplicationKind)
   358  }
   359  
   360  // isRetryableError is a helper method to see whether an error
   361  // returned from the dynamic client is potentially retryable.
   362  func isRetryableError(err error) bool {
   363  	if err == nil {
   364  		return false
   365  	}
   366  	return kerrors.IsInternalError(err) ||
   367  		kerrors.IsInvalid(err) ||
   368  		kerrors.IsTooManyRequests(err) ||
   369  		kerrors.IsServerTimeout(err) ||
   370  		kerrors.IsServiceUnavailable(err) ||
   371  		kerrors.IsTimeout(err) ||
   372  		kerrors.IsUnexpectedObjectError(err) ||
   373  		kerrors.IsUnexpectedServerError(err) ||
   374  		isResourceQuotaConflictErr(err) ||
   375  		isTransientNetworkErr(err) ||
   376  		isExceededQuotaErr(err) ||
   377  		errors.Is(err, syscall.ECONNRESET)
   378  }
   379  
   380  func isExceededQuotaErr(err error) bool {
   381  	return kerrors.IsForbidden(err) && strings.Contains(err.Error(), "exceeded quota")
   382  }
   383  
   384  func isResourceQuotaConflictErr(err error) bool {
   385  	return kerrors.IsConflict(err) && strings.Contains(err.Error(), "Operation cannot be fulfilled on resourcequota")
   386  }
   387  
   388  func isTransientNetworkErr(err error) bool {
   389  	switch err.(type) {
   390  	case net.Error:
   391  		switch err.(type) {
   392  		case *net.DNSError, *net.OpError, net.UnknownNetworkError:
   393  			return true
   394  		case *url.Error:
   395  			// For a URL error, where it replies "connection closed"
   396  			// retry again.
   397  			return strings.Contains(err.Error(), "Connection closed by foreign host")
   398  		}
   399  	}
   400  
   401  	errorString := err.Error()
   402  	if exitErr, ok := err.(*exec.ExitError); ok {
   403  		errorString = fmt.Sprintf("%s %s", errorString, exitErr.Stderr)
   404  	}
   405  	if strings.Contains(errorString, "net/http: TLS handshake timeout") ||
   406  		strings.Contains(errorString, "i/o timeout") ||
   407  		strings.Contains(errorString, "connection timed out") ||
   408  		strings.Contains(errorString, "connection reset by peer") {
   409  		return true
   410  	}
   411  	return false
   412  }
   413  
// getCluster returns the cluster cache for the given server URL, lazily
// constructing and registering a new one on first access. It is safe for
// concurrent use: the fast path takes only a read lock, and the slow path
// re-checks the map after upgrading to the write lock.
func (c *liveStateCache) getCluster(server string) (clustercache.ClusterCache, error) {
	c.lock.RLock()
	clusterCache, ok := c.clusters[server]
	cacheSettings := c.cacheSettings
	c.lock.RUnlock()

	if ok {
		return clusterCache, nil
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	// Re-check under the write lock: another goroutine may have created the
	// cache between RUnlock and Lock.
	clusterCache, ok = c.clusters[server]
	if ok {
		return clusterCache, nil
	}

	cluster, err := c.db.GetCluster(context.Background(), server)
	if err != nil {
		return nil, fmt.Errorf("error getting cluster: %w", err)
	}

	if !c.canHandleCluster(cluster) {
		return nil, fmt.Errorf("controller is configured to ignore cluster %s", cluster.Server)
	}

	resourceCustomLabels, err := c.settingsMgr.GetResourceCustomLabels()
	if err != nil {
		return nil, fmt.Errorf("error getting custom label: %w", err)
	}

	respectRBAC, err := c.settingsMgr.RespectRBAC()
	if err != nil {
		return nil, fmt.Errorf("error getting value for %v: %w", settings.RespectRBAC, err)
	}

	clusterCacheConfig := cluster.RESTConfig()
	// Controller dynamically fetches all resource types available on the cluster
	// using a discovery API that may contain deprecated APIs.
	// This causes log flooding when managing a large number of clusters.
	// https://github.com/argoproj/argo-cd/issues/11973
	// However, we can safely suppress deprecation warnings
	// because we do not rely on resources with a particular API group or version.
	// https://kubernetes.io/blog/2020/09/03/warnings/#customize-client-handling
	//
	// Completely suppress warning logs only for log levels that are less than Debug.
	if log.GetLevel() < log.DebugLevel {
		clusterCacheConfig.WarningHandler = rest.NoWarnings{}
	}

	clusterCacheOpts := []clustercache.UpdateSettingsFunc{
		clustercache.SetListSemaphore(semaphore.NewWeighted(clusterCacheListSemaphoreSize)),
		clustercache.SetListPageSize(clusterCacheListPageSize),
		clustercache.SetListPageBufferSize(clusterCacheListPageBufferSize),
		clustercache.SetWatchResyncTimeout(clusterCacheWatchResyncDuration),
		clustercache.SetClusterSyncRetryTimeout(clusterSyncRetryTimeoutDuration),
		clustercache.SetResyncTimeout(clusterCacheResyncDuration),
		clustercache.SetSettings(cacheSettings.clusterSettings),
		clustercache.SetNamespaces(cluster.Namespaces),
		clustercache.SetClusterResources(cluster.ClusterResources),
		clustercache.SetPopulateResourceInfoHandler(func(un *unstructured.Unstructured, isRoot bool) (interface{}, bool) {
			res := &ResourceInfo{}
			populateNodeInfo(un, res, resourceCustomLabels)
			// Re-read the settings snapshot under the lock: settings may have
			// changed since this cluster cache was created.
			c.lock.RLock()
			cacheSettings := c.cacheSettings
			c.lock.RUnlock()

			res.Health, _ = health.GetResourceHealth(un, cacheSettings.clusterSettings.ResourceHealthOverride)

			appName := c.resourceTracking.GetAppName(un, cacheSettings.appInstanceLabelKey, cacheSettings.trackingMethod)
			if isRoot && appName != "" {
				res.AppName = appName
			}

			gvk := un.GroupVersionKind()

			if cacheSettings.ignoreResourceUpdatesEnabled && shouldHashManifest(appName, gvk) {
				hash, err := generateManifestHash(un, nil, cacheSettings.resourceOverrides, c.ignoreNormalizerOpts)
				if err != nil {
					log.Errorf("Failed to generate manifest hash: %v", err)
				} else {
					res.manifestHash = hash
				}
			}

			// edge case. we do not label CRDs, so they miss the tracking label we inject. But we still
			// want the full resource to be available in our cache (to diff), so we store all CRDs
			return res, res.AppName != "" || gvk.Kind == kube.CustomResourceDefinitionKind
		}),
		clustercache.SetLogr(logutils.NewLogrusLogger(log.WithField("server", cluster.Server))),
		clustercache.SetRetryOptions(clusterCacheAttemptLimit, clusterCacheRetryUseBackoff, isRetryableError),
		clustercache.SetRespectRBAC(respectRBAC),
	}

	clusterCache = clustercache.NewClusterCache(clusterCacheConfig, clusterCacheOpts...)

	// Schedule app refreshes whenever a cached resource changes, unless the
	// ignore-resource-updates rules classify the change as a no-op.
	_ = clusterCache.OnResourceUpdated(func(newRes *clustercache.Resource, oldRes *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) {
		toNotify := make(map[string]bool)
		var ref v1.ObjectReference
		if newRes != nil {
			ref = newRes.Ref
		} else {
			ref = oldRes.Ref
		}

		c.lock.RLock()
		cacheSettings := c.cacheSettings
		c.lock.RUnlock()

		if cacheSettings.ignoreResourceUpdatesEnabled && oldRes != nil && newRes != nil && skipResourceUpdate(resInfo(oldRes), resInfo(newRes)) {
			// Additional check for debug level so we don't need to evaluate the
			// format string in case of non-debug scenarios
			if log.GetLevel() >= log.DebugLevel {
				namespace := ref.Namespace
				if ref.Namespace == "" {
					namespace = "(cluster-scoped)"
				}
				log.WithFields(log.Fields{
					"server":      cluster.Server,
					"namespace":   namespace,
					"name":        ref.Name,
					"api-version": ref.APIVersion,
					"kind":        ref.Kind,
				}).Debug("Ignoring change of object because none of the watched resource fields have changed")
			}
			return
		}

		for _, r := range []*clustercache.Resource{newRes, oldRes} {
			if r == nil {
				continue
			}
			app := getApp(r, namespaceResources)
			if app == "" || skipAppRequeuing(r.ResourceKey()) {
				continue
			}
			// The value is true when the changed resource is the app's root.
			toNotify[app] = isRootAppNode(r) || toNotify[app]
		}
		c.onObjectUpdated(toNotify, ref)
	})

	// Count every cluster event for the metrics endpoint.
	_ = clusterCache.OnEvent(func(event watch.EventType, un *unstructured.Unstructured) {
		gvk := un.GroupVersionKind()
		c.metricsServer.IncClusterEventsCount(cluster.Server, gvk.Group, gvk.Kind)
	})

	c.clusters[server] = clusterCache

	return clusterCache, nil
}
   565  
   566  func (c *liveStateCache) getSyncedCluster(server string) (clustercache.ClusterCache, error) {
   567  	clusterCache, err := c.getCluster(server)
   568  	if err != nil {
   569  		return nil, fmt.Errorf("error getting cluster: %w", err)
   570  	}
   571  	err = clusterCache.EnsureSynced()
   572  	if err != nil {
   573  		return nil, fmt.Errorf("error synchronizing cache state : %w", err)
   574  	}
   575  	return clusterCache, nil
   576  }
   577  
   578  func (c *liveStateCache) invalidate(cacheSettings cacheSettings) {
   579  	log.Info("invalidating live state cache")
   580  	c.lock.Lock()
   581  	c.cacheSettings = cacheSettings
   582  	clusters := c.clusters
   583  	c.lock.Unlock()
   584  
   585  	for _, clust := range clusters {
   586  		clust.Invalidate(clustercache.SetSettings(cacheSettings.clusterSettings))
   587  	}
   588  	log.Info("live state cache invalidated")
   589  }
   590  
   591  func (c *liveStateCache) IsNamespaced(server string, gk schema.GroupKind) (bool, error) {
   592  	clusterInfo, err := c.getSyncedCluster(server)
   593  	if err != nil {
   594  		return false, err
   595  	}
   596  	return clusterInfo.IsNamespaced(gk)
   597  }
   598  
   599  func (c *liveStateCache) IterateHierarchy(server string, key kube.ResourceKey, action func(child appv1.ResourceNode, appName string) bool) error {
   600  	clusterInfo, err := c.getSyncedCluster(server)
   601  	if err != nil {
   602  		return err
   603  	}
   604  	clusterInfo.IterateHierarchy(key, func(resource *clustercache.Resource, namespaceResources map[kube.ResourceKey]*clustercache.Resource) bool {
   605  		return action(asResourceNode(resource), getApp(resource, namespaceResources))
   606  	})
   607  	return nil
   608  }
   609  
   610  func (c *liveStateCache) IterateResources(server string, callback func(res *clustercache.Resource, info *ResourceInfo)) error {
   611  	clusterInfo, err := c.getSyncedCluster(server)
   612  	if err != nil {
   613  		return err
   614  	}
   615  	_ = clusterInfo.FindResources("", func(r *clustercache.Resource) bool {
   616  		if info, ok := r.Info.(*ResourceInfo); ok {
   617  			callback(r, info)
   618  		}
   619  		return false
   620  	})
   621  	return nil
   622  }
   623  
   624  func (c *liveStateCache) GetNamespaceTopLevelResources(server string, namespace string) (map[kube.ResourceKey]appv1.ResourceNode, error) {
   625  	clusterInfo, err := c.getSyncedCluster(server)
   626  	if err != nil {
   627  		return nil, err
   628  	}
   629  	resources := clusterInfo.FindResources(namespace, clustercache.TopLevelResource)
   630  	res := make(map[kube.ResourceKey]appv1.ResourceNode)
   631  	for k, r := range resources {
   632  		res[k] = asResourceNode(r)
   633  	}
   634  	return res, nil
   635  }
   636  
   637  func (c *liveStateCache) GetManagedLiveObjs(a *appv1.Application, targetObjs []*unstructured.Unstructured) (map[kube.ResourceKey]*unstructured.Unstructured, error) {
   638  	clusterInfo, err := c.getSyncedCluster(a.Spec.Destination.Server)
   639  	if err != nil {
   640  		return nil, fmt.Errorf("failed to get cluster info for %q: %w", a.Spec.Destination.Server, err)
   641  	}
   642  	return clusterInfo.GetManagedLiveObjs(targetObjs, func(r *clustercache.Resource) bool {
   643  		return resInfo(r).AppName == a.InstanceName(c.settingsMgr.GetNamespace())
   644  	})
   645  }
   646  
   647  func (c *liveStateCache) GetVersionsInfo(serverURL string) (string, []kube.APIResourceInfo, error) {
   648  	clusterInfo, err := c.getSyncedCluster(serverURL)
   649  	if err != nil {
   650  		return "", nil, fmt.Errorf("failed to get cluster info for %q: %w", serverURL, err)
   651  	}
   652  	return clusterInfo.GetServerVersion(), clusterInfo.GetAPIResources(), nil
   653  }
   654  
   655  func (c *liveStateCache) isClusterHasApps(apps []interface{}, cluster *appv1.Cluster) bool {
   656  	for _, obj := range apps {
   657  		app, ok := obj.(*appv1.Application)
   658  		if !ok {
   659  			continue
   660  		}
   661  		err := argo.ValidateDestination(context.Background(), &app.Spec.Destination, c.db)
   662  		if err != nil {
   663  			continue
   664  		}
   665  		if app.Spec.Destination.Server == cluster.Server {
   666  			return true
   667  		}
   668  	}
   669  	return false
   670  }
   671  
// watchSettings reloads the cache settings whenever the settings manager
// publishes an update and invalidates all cluster caches when the effective
// settings actually changed. It runs until ctx is cancelled, then unsubscribes
// from and closes the update channel.
func (c *liveStateCache) watchSettings(ctx context.Context) {
	updateCh := make(chan *settings.ArgoCDSettings, 1)
	c.settingsMgr.Subscribe(updateCh)

	done := false
	for !done {
		select {
		case <-updateCh:
			nextCacheSettings, err := c.loadCacheSettings()
			if err != nil {
				log.Warnf("Failed to read updated settings: %v", err)
				continue
			}

			c.lock.Lock()
			needInvalidate := false
			if !reflect.DeepEqual(c.cacheSettings, *nextCacheSettings) {
				c.cacheSettings = *nextCacheSettings
				needInvalidate = true
			}
			c.lock.Unlock()
			// invalidate takes c.lock itself, so it must run outside the
			// critical section above.
			if needInvalidate {
				c.invalidate(*nextCacheSettings)
			}
		case <-ctx.Done():
			done = true
		}
	}
	log.Info("shutting down settings watch")
	c.settingsMgr.Unsubscribe(updateCh)
	close(updateCh)
}
   704  
   705  func (c *liveStateCache) Init() error {
   706  	cacheSettings, err := c.loadCacheSettings()
   707  	if err != nil {
   708  		return fmt.Errorf("error loading cache settings: %w", err)
   709  	}
   710  	c.cacheSettings = *cacheSettings
   711  	return nil
   712  }
   713  
// Run watches for resource changes annotated with application label on all registered clusters and schedule corresponding app refresh.
func (c *liveStateCache) Run(ctx context.Context) error {
	go c.watchSettings(ctx)

	// Keep (re)establishing the cluster watch until the context is cancelled.
	kube.RetryUntilSucceed(ctx, clustercache.ClusterRetryTimeout, "watch clusters", logutils.NewLogrusLogger(logutils.NewWithCurrentConfig()), func() error {
		return c.db.WatchClusters(ctx, c.handleAddEvent, c.handleModEvent, c.handleDeleteEvent)
	})

	<-ctx.Done()
	// NOTE(review): c.cacheSettings is read here without holding c.lock;
	// confirm whether a racing settings update during shutdown matters.
	c.invalidate(c.cacheSettings)
	return nil
}
   726  
   727  func (c *liveStateCache) canHandleCluster(cluster *appv1.Cluster) bool {
   728  	return c.clusterSharding.IsManagedCluster(cluster)
   729  }
   730  
   731  func (c *liveStateCache) handleAddEvent(cluster *appv1.Cluster) {
   732  	c.clusterSharding.Add(cluster)
   733  	if !c.canHandleCluster(cluster) {
   734  		log.Infof("Ignoring cluster %s", cluster.Server)
   735  		return
   736  	}
   737  	c.lock.Lock()
   738  	_, ok := c.clusters[cluster.Server]
   739  	c.lock.Unlock()
   740  	if !ok {
   741  		log.Debugf("Checking if cache %v / cluster %v has appInformer %v", c, cluster, c.appInformer)
   742  		if c.appInformer == nil {
   743  			log.Warn("Cannot get a cluster appInformer. Cache may not be started this time")
   744  			return
   745  		}
   746  		if c.isClusterHasApps(c.appInformer.GetStore().List(), cluster) {
   747  			go func() {
   748  				// warm up cache for cluster with apps
   749  				_, _ = c.getSyncedCluster(cluster.Server)
   750  			}()
   751  		}
   752  	}
   753  }
   754  
   755  func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *appv1.Cluster) {
   756  	c.clusterSharding.Update(oldCluster, newCluster)
   757  	c.lock.Lock()
   758  	cluster, ok := c.clusters[newCluster.Server]
   759  	c.lock.Unlock()
   760  	if ok {
   761  		if !c.canHandleCluster(newCluster) {
   762  			cluster.Invalidate()
   763  			c.lock.Lock()
   764  			delete(c.clusters, newCluster.Server)
   765  			c.lock.Unlock()
   766  			return
   767  		}
   768  
   769  		var updateSettings []clustercache.UpdateSettingsFunc
   770  		if !reflect.DeepEqual(oldCluster.Config, newCluster.Config) {
   771  			updateSettings = append(updateSettings, clustercache.SetConfig(newCluster.RESTConfig()))
   772  		}
   773  		if !reflect.DeepEqual(oldCluster.Namespaces, newCluster.Namespaces) {
   774  			updateSettings = append(updateSettings, clustercache.SetNamespaces(newCluster.Namespaces))
   775  		}
   776  		if !reflect.DeepEqual(oldCluster.ClusterResources, newCluster.ClusterResources) {
   777  			updateSettings = append(updateSettings, clustercache.SetClusterResources(newCluster.ClusterResources))
   778  		}
   779  		forceInvalidate := false
   780  		if newCluster.RefreshRequestedAt != nil &&
   781  			cluster.GetClusterInfo().LastCacheSyncTime != nil &&
   782  			cluster.GetClusterInfo().LastCacheSyncTime.Before(newCluster.RefreshRequestedAt.Time) {
   783  			forceInvalidate = true
   784  		}
   785  
   786  		if len(updateSettings) > 0 || forceInvalidate {
   787  			cluster.Invalidate(updateSettings...)
   788  			go func() {
   789  				// warm up cluster cache
   790  				_ = cluster.EnsureSynced()
   791  			}()
   792  		}
   793  	}
   794  
   795  }
   796  
   797  func (c *liveStateCache) handleDeleteEvent(clusterServer string) {
   798  	c.lock.RLock()
   799  	c.clusterSharding.Delete(clusterServer)
   800  	cluster, ok := c.clusters[clusterServer]
   801  	c.lock.RUnlock()
   802  	if ok {
   803  		cluster.Invalidate()
   804  		c.lock.Lock()
   805  		delete(c.clusters, clusterServer)
   806  		c.lock.Unlock()
   807  	}
   808  }
   809  
   810  func (c *liveStateCache) GetClustersInfo() []clustercache.ClusterInfo {
   811  	clusters := make(map[string]clustercache.ClusterCache)
   812  	c.lock.RLock()
   813  	for k := range c.clusters {
   814  		clusters[k] = c.clusters[k]
   815  	}
   816  	c.lock.RUnlock()
   817  
   818  	res := make([]clustercache.ClusterInfo, 0)
   819  	for server, c := range clusters {
   820  		info := c.GetClusterInfo()
   821  		info.Server = server
   822  		res = append(res, info)
   823  	}
   824  	return res
   825  }
   826  
// GetClusterCache returns the cluster cache for the given cluster server URL
// by delegating to getSyncedCluster (presumably ensuring the cache is synced
// before returning — confirm against getSyncedCluster's implementation).
func (c *liveStateCache) GetClusterCache(server string) (clustercache.ClusterCache, error) {
	return c.getSyncedCluster(server)
}