github.com/argoproj-labs/argocd-operator@v0.10.0/controllers/argocd/prometheus.go (about)

     1  // Copyright 2019 ArgoCD Operator Developers
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  // 	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package argocd
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	monitoringv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
    22  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    23  	"k8s.io/apimachinery/pkg/util/intstr"
    24  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    25  
    26  	argoproj "github.com/argoproj-labs/argocd-operator/api/v1beta1"
    27  	"github.com/argoproj-labs/argocd-operator/common"
    28  	"github.com/argoproj-labs/argocd-operator/controllers/argoutil"
    29  )
    30  
    31  var prometheusAPIFound = false
    32  
    33  // getPrometheusHost will return the hostname value for Prometheus.
    34  func getPrometheusHost(cr *argoproj.ArgoCD) string {
    35  	host := nameWithSuffix("prometheus", cr)
    36  	if len(cr.Spec.Prometheus.Host) > 0 {
    37  		host = cr.Spec.Prometheus.Host
    38  	}
    39  	return host
    40  }
    41  
    42  // getPrometheusSize will return the size value for the Prometheus replica count.
    43  func getPrometheusReplicas(cr *argoproj.ArgoCD) *int32 {
    44  	replicas := common.ArgoCDDefaultPrometheusReplicas
    45  	if cr.Spec.Prometheus.Size != nil {
    46  		if *cr.Spec.Prometheus.Size >= 0 && *cr.Spec.Prometheus.Size != replicas {
    47  			replicas = *cr.Spec.Prometheus.Size
    48  		}
    49  	}
    50  	return &replicas
    51  }
    52  
    53  // IsPrometheusAPIAvailable returns true if the Prometheus API is present.
    54  func IsPrometheusAPIAvailable() bool {
    55  	return prometheusAPIFound
    56  }
    57  
    58  // hasPrometheusSpecChanged will return true if the supported properties differs in the actual versus the desired state.
    59  func hasPrometheusSpecChanged(actual *monitoringv1.Prometheus, desired *argoproj.ArgoCD) bool {
    60  	// Replica count
    61  	if desired.Spec.Prometheus.Size != nil && *desired.Spec.Prometheus.Size >= 0 { // Valid replica count specified in desired state
    62  		if actual.Spec.Replicas != nil { // Actual replicas value is set
    63  			if *actual.Spec.Replicas != *desired.Spec.Prometheus.Size {
    64  				return true
    65  			}
    66  		} else if *desired.Spec.Prometheus.Size != common.ArgoCDDefaultPrometheusReplicas { // Actual replicas value is NOT set, but desired replicas differs from the default
    67  			return true
    68  		}
    69  	} else { // Replica count NOT specified in desired state
    70  		if actual.Spec.Replicas != nil && *actual.Spec.Replicas != common.ArgoCDDefaultPrometheusReplicas {
    71  			return true
    72  		}
    73  	}
    74  	return false
    75  }
    76  
    77  // verifyPrometheusAPI will verify that the Prometheus API is present.
    78  func verifyPrometheusAPI() error {
    79  	found, err := argoutil.VerifyAPI(monitoringv1.SchemeGroupVersion.Group, monitoringv1.SchemeGroupVersion.Version)
    80  	if err != nil {
    81  		return err
    82  	}
    83  	prometheusAPIFound = found
    84  	return nil
    85  }
    86  
    87  // newPrometheus returns a new Prometheus instance for the given ArgoCD.
    88  func newPrometheus(cr *argoproj.ArgoCD) *monitoringv1.Prometheus {
    89  	return &monitoringv1.Prometheus{
    90  		ObjectMeta: metav1.ObjectMeta{
    91  			Name:      cr.Name,
    92  			Namespace: cr.Namespace,
    93  			Labels:    argoutil.LabelsForCluster(cr),
    94  		},
    95  	}
    96  }
    97  
    98  // newServiceMonitor returns a new ServiceMonitor instance.
    99  func newServiceMonitor(cr *argoproj.ArgoCD) *monitoringv1.ServiceMonitor {
   100  	return &monitoringv1.ServiceMonitor{
   101  		ObjectMeta: metav1.ObjectMeta{
   102  			Name:      cr.Name,
   103  			Namespace: cr.Namespace,
   104  			Labels:    argoutil.LabelsForCluster(cr),
   105  		},
   106  	}
   107  }
   108  
   109  // newServiceMonitorWithName returns a new ServiceMonitor instance for the given ArgoCD using the given name.
   110  func newServiceMonitorWithName(name string, cr *argoproj.ArgoCD) *monitoringv1.ServiceMonitor {
   111  	svcmon := newServiceMonitor(cr)
   112  	svcmon.ObjectMeta.Name = name
   113  
   114  	lbls := svcmon.ObjectMeta.Labels
   115  	lbls[common.ArgoCDKeyName] = name
   116  	lbls[common.ArgoCDKeyRelease] = "prometheus-operator"
   117  	svcmon.ObjectMeta.Labels = lbls
   118  
   119  	return svcmon
   120  }
   121  
   122  // newServiceMonitorWithSuffix returns a new ServiceMonitor instance for the given ArgoCD using the given suffix.
   123  func newServiceMonitorWithSuffix(suffix string, cr *argoproj.ArgoCD) *monitoringv1.ServiceMonitor {
   124  	return newServiceMonitorWithName(fmt.Sprintf("%s-%s", cr.Name, suffix), cr)
   125  }
   126  
   127  // reconcileMetricsServiceMonitor will ensure that the ServiceMonitor is present for the ArgoCD metrics Service.
   128  func (r *ReconcileArgoCD) reconcileMetricsServiceMonitor(cr *argoproj.ArgoCD) error {
   129  	sm := newServiceMonitorWithSuffix(common.ArgoCDKeyMetrics, cr)
   130  	if argoutil.IsObjectFound(r.Client, cr.Namespace, sm.Name, sm) {
   131  		if !cr.Spec.Prometheus.Enabled {
   132  			// ServiceMonitor exists but enabled flag has been set to false, delete the ServiceMonitor
   133  			return r.Client.Delete(context.TODO(), sm)
   134  		}
   135  		return nil // ServiceMonitor found, do nothing
   136  	}
   137  
   138  	if !cr.Spec.Prometheus.Enabled {
   139  		return nil // Prometheus not enabled, do nothing.
   140  	}
   141  
   142  	sm.Spec.Selector = metav1.LabelSelector{
   143  		MatchLabels: map[string]string{
   144  			common.ArgoCDKeyName: nameWithSuffix(common.ArgoCDKeyMetrics, cr),
   145  		},
   146  	}
   147  	sm.Spec.Endpoints = []monitoringv1.Endpoint{
   148  		{
   149  			Port: common.ArgoCDKeyMetrics,
   150  		},
   151  	}
   152  
   153  	if err := controllerutil.SetControllerReference(cr, sm, r.Scheme); err != nil {
   154  		return err
   155  	}
   156  	return r.Client.Create(context.TODO(), sm)
   157  }
   158  
   159  // reconcilePrometheus will ensure that Prometheus is present for ArgoCD metrics.
   160  func (r *ReconcileArgoCD) reconcilePrometheus(cr *argoproj.ArgoCD) error {
   161  	prometheus := newPrometheus(cr)
   162  	if argoutil.IsObjectFound(r.Client, cr.Namespace, prometheus.Name, prometheus) {
   163  		if !cr.Spec.Prometheus.Enabled {
   164  			// Prometheus exists but enabled flag has been set to false, delete the Prometheus
   165  			return r.Client.Delete(context.TODO(), prometheus)
   166  		}
   167  		if hasPrometheusSpecChanged(prometheus, cr) {
   168  			prometheus.Spec.Replicas = cr.Spec.Prometheus.Size
   169  			return r.Client.Update(context.TODO(), prometheus)
   170  		}
   171  		return nil // Prometheus found, do nothing
   172  	}
   173  
   174  	if !cr.Spec.Prometheus.Enabled {
   175  		return nil // Prometheus not enabled, do nothing.
   176  	}
   177  
   178  	prometheus.Spec.Replicas = getPrometheusReplicas(cr)
   179  	prometheus.Spec.ServiceAccountName = "prometheus-k8s"
   180  	prometheus.Spec.ServiceMonitorSelector = &metav1.LabelSelector{}
   181  
   182  	if err := controllerutil.SetControllerReference(cr, prometheus, r.Scheme); err != nil {
   183  		return err
   184  	}
   185  	return r.Client.Create(context.TODO(), prometheus)
   186  }
   187  
   188  // reconcileRepoServerServiceMonitor will ensure that the ServiceMonitor is present for the Repo Server metrics Service.
   189  func (r *ReconcileArgoCD) reconcileRepoServerServiceMonitor(cr *argoproj.ArgoCD) error {
   190  	sm := newServiceMonitorWithSuffix("repo-server-metrics", cr)
   191  	if argoutil.IsObjectFound(r.Client, cr.Namespace, sm.Name, sm) {
   192  		if !cr.Spec.Prometheus.Enabled {
   193  			// ServiceMonitor exists but enabled flag has been set to false, delete the ServiceMonitor
   194  			return r.Client.Delete(context.TODO(), sm)
   195  		}
   196  		return nil // ServiceMonitor found, do nothing
   197  	}
   198  
   199  	if !cr.Spec.Prometheus.Enabled {
   200  		return nil // Prometheus not enabled, do nothing.
   201  	}
   202  
   203  	sm.Spec.Selector = metav1.LabelSelector{
   204  		MatchLabels: map[string]string{
   205  			common.ArgoCDKeyName: nameWithSuffix("repo-server", cr),
   206  		},
   207  	}
   208  	sm.Spec.Endpoints = []monitoringv1.Endpoint{
   209  		{
   210  			Port: common.ArgoCDKeyMetrics,
   211  		},
   212  	}
   213  
   214  	if err := controllerutil.SetControllerReference(cr, sm, r.Scheme); err != nil {
   215  		return err
   216  	}
   217  	return r.Client.Create(context.TODO(), sm)
   218  }
   219  
   220  // reconcileServerMetricsServiceMonitor will ensure that the ServiceMonitor is present for the ArgoCD Server metrics Service.
   221  func (r *ReconcileArgoCD) reconcileServerMetricsServiceMonitor(cr *argoproj.ArgoCD) error {
   222  	sm := newServiceMonitorWithSuffix("server-metrics", cr)
   223  	if argoutil.IsObjectFound(r.Client, cr.Namespace, sm.Name, sm) {
   224  		if !cr.Spec.Prometheus.Enabled {
   225  			// ServiceMonitor exists but enabled flag has been set to false, delete the ServiceMonitor
   226  			return r.Client.Delete(context.TODO(), sm)
   227  		}
   228  		return nil // ServiceMonitor found, do nothing
   229  	}
   230  
   231  	if !cr.Spec.Prometheus.Enabled {
   232  		return nil // Prometheus not enabled, do nothing.
   233  	}
   234  
   235  	sm.Spec.Selector = metav1.LabelSelector{
   236  		MatchLabels: map[string]string{
   237  			common.ArgoCDKeyName: nameWithSuffix("server-metrics", cr),
   238  		},
   239  	}
   240  	sm.Spec.Endpoints = []monitoringv1.Endpoint{
   241  		{
   242  			Port: common.ArgoCDKeyMetrics,
   243  		},
   244  	}
   245  
   246  	if err := controllerutil.SetControllerReference(cr, sm, r.Scheme); err != nil {
   247  		return err
   248  	}
   249  	return r.Client.Create(context.TODO(), sm)
   250  }
   251  
   252  // reconcilePrometheusRule reconciles the PrometheusRule that triggers alerts based on workload statuses
   253  func (r *ReconcileArgoCD) reconcilePrometheusRule(cr *argoproj.ArgoCD) error {
   254  
   255  	promRule := newPrometheusRule(cr.Namespace, "argocd-component-status-alert")
   256  
   257  	if argoutil.IsObjectFound(r.Client, cr.Namespace, promRule.Name, promRule) {
   258  
   259  		if !cr.Spec.Monitoring.Enabled {
   260  			// PrometheusRule exists but enabled flag has been set to false, delete the PrometheusRule
   261  			log.Info("instance monitoring disabled, deleting component status tracking prometheusRule")
   262  			return r.Client.Delete(context.TODO(), promRule)
   263  		}
   264  		return nil // PrometheusRule found, do nothing
   265  	}
   266  
   267  	if !cr.Spec.Monitoring.Enabled {
   268  		return nil // Monitoring not enabled, do nothing.
   269  	}
   270  
   271  	ruleGroups := []monitoringv1.RuleGroup{
   272  		{
   273  			Name: "ArgoCDComponentStatus",
   274  			Rules: []monitoringv1.Rule{
   275  				{
   276  					Alert: "ApplicationControllerNotReady",
   277  					Annotations: map[string]string{
   278  						"message": fmt.Sprintf("application controller deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   279  					},
   280  					Expr: intstr.IntOrString{
   281  						Type:   intstr.String,
   282  						StrVal: fmt.Sprintf("kube_statefulset_status_replicas{statefulset=\"%s\", namespace=\"%s\"} != kube_statefulset_status_replicas_ready{statefulset=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-application-controller"), cr.Namespace, fmt.Sprintf(cr.Name+"-application-controller"), cr.Namespace),
   283  					},
   284  					For: "1m",
   285  					Labels: map[string]string{
   286  						"severity": "critical",
   287  					},
   288  				},
   289  				{
   290  					Alert: "ServerNotReady",
   291  					Annotations: map[string]string{
   292  						"message": fmt.Sprintf("server deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   293  					},
   294  					Expr: intstr.IntOrString{
   295  						Type:   intstr.String,
   296  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-server"), cr.Namespace, fmt.Sprintf(cr.Name+"-server"), cr.Namespace),
   297  					},
   298  					For: "1m",
   299  					Labels: map[string]string{
   300  						"severity": "critical",
   301  					},
   302  				},
   303  				{
   304  					Alert: "RepoServerNotReady",
   305  					Annotations: map[string]string{
   306  						"message": fmt.Sprintf("repo server deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   307  					},
   308  					Expr: intstr.IntOrString{
   309  						Type:   intstr.String,
   310  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-repo-server"), cr.Namespace, fmt.Sprintf(cr.Name+"-repo-server"), cr.Namespace),
   311  					},
   312  					For: "1m",
   313  					Labels: map[string]string{
   314  						"severity": "critical",
   315  					},
   316  				},
   317  				{
   318  					Alert: "ApplicationSetControllerNotReady",
   319  					Annotations: map[string]string{
   320  						"message": fmt.Sprintf("applicationSet controller deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   321  					},
   322  					Expr: intstr.IntOrString{
   323  						Type:   intstr.String,
   324  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-applicationset-controller"), cr.Namespace, fmt.Sprintf(cr.Name+"-applicationset-controller"), cr.Namespace),
   325  					},
   326  					For: "5m",
   327  					Labels: map[string]string{
   328  						"severity": "warning",
   329  					},
   330  				},
   331  				{
   332  					Alert: "DexNotReady",
   333  					Annotations: map[string]string{
   334  						"message": fmt.Sprintf("dex deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   335  					},
   336  					Expr: intstr.IntOrString{
   337  						Type:   intstr.String,
   338  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-dex-server"), cr.Namespace, fmt.Sprintf(cr.Name+"-dex-server"), cr.Namespace),
   339  					},
   340  					For: "5m",
   341  					Labels: map[string]string{
   342  						"severity": "warning",
   343  					},
   344  				},
   345  				{
   346  					Alert: "NotificationsControllerNotReady",
   347  					Annotations: map[string]string{
   348  						"message": fmt.Sprintf("notifications controller deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   349  					},
   350  					Expr: intstr.IntOrString{
   351  						Type:   intstr.String,
   352  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-notifications-controller"), cr.Namespace, fmt.Sprintf(cr.Name+"-notifications-controller"), cr.Namespace),
   353  					},
   354  					For: "5m",
   355  					Labels: map[string]string{
   356  						"severity": "warning",
   357  					},
   358  				},
   359  				{
   360  					Alert: "RedisNotReady",
   361  					Annotations: map[string]string{
   362  						"message": fmt.Sprintf("redis deployment for Argo CD instance in namespace %s is not running", cr.Namespace),
   363  					},
   364  					Expr: intstr.IntOrString{
   365  						Type:   intstr.String,
   366  						StrVal: fmt.Sprintf("kube_deployment_status_replicas{deployment=\"%s\", namespace=\"%s\"} != kube_deployment_status_replicas_ready{deployment=\"%s\", namespace=\"%s\"} ", fmt.Sprintf(cr.Name+"-redis"), cr.Namespace, fmt.Sprintf(cr.Name+"-redis"), cr.Namespace),
   367  					},
   368  					For: "5m",
   369  					Labels: map[string]string{
   370  						"severity": "warning",
   371  					},
   372  				},
   373  			},
   374  		},
   375  	}
   376  	promRule.Spec.Groups = ruleGroups
   377  
   378  	if err := controllerutil.SetControllerReference(cr, promRule, r.Scheme); err != nil {
   379  		return err
   380  	}
   381  
   382  	log.Info("instance monitoring enabled, creating component status tracking prometheusRule")
   383  	return r.Client.Create(context.TODO(), promRule) // Create PrometheusRule
   384  }
   385  
   386  // newPrometheusRule returns an empty PrometheusRule
   387  func newPrometheusRule(namespace, alertRuleName string) *monitoringv1.PrometheusRule {
   388  
   389  	promRule := &monitoringv1.PrometheusRule{
   390  		ObjectMeta: metav1.ObjectMeta{
   391  			Name:      alertRuleName,
   392  			Namespace: namespace,
   393  		},
   394  		Spec: monitoringv1.PrometheusRuleSpec{},
   395  	}
   396  	return promRule
   397  }