github.com/spotahome/redis-operator@v1.2.4/metrics/metrics.go (about)

     1  package metrics
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/prometheus/client_golang/prometheus"
    10  	koopercontroller "github.com/spotahome/kooper/v2/controller"
    11  	kooperprometheus "github.com/spotahome/kooper/v2/metrics/prometheus"
    12  	"github.com/spotahome/redis-operator/log"
    13  )
    14  
    15  const (
    16  	promControllerSubsystem  = "controller"
    17  	metricsGCIntervalMinutes = 5
    18  )
    19  
    20  func init() {
    21  	go removeStaleMetrics()
    22  }
    23  
    24  // variables for setting various indicator labels
    25  const (
    26  	SUCCESS                                = "SUCCESS"
    27  	FAIL                                   = "FAIL"
    28  	STATUS_HEALTHY                         = "HEALTHY"
    29  	STATUS_UNHEALTHY                       = "UNHEALTHY"
    30  	NOT_APPLICABLE                         = "NA"
    31  	UNHEALTHY                              = 1.0
    32  	HEALTHY                                = 0.0
    33  	REDIS_REPLICA_MISMATCH                 = "REDIS_STATEFULSET_REPLICAS_MISMATCH"
    34  	SENTINEL_REPLICA_MISMATCH              = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH"
    35  	NO_MASTER                              = "NO_MASTER_AVAILABLE"
    36  	NUMBER_OF_MASTERS                      = "MASTER_COUNT_IS_NOT_ONE"
    37  	SENTINEL_WRONG_MASTER                  = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
    38  	SLAVE_WRONG_MASTER                     = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
    39  	SENTINEL_NOT_READY                     = "SENTINEL_NOT_READY"
    40  	REGEX_NOT_FOUND                        = "SENTINEL_REGEX_NOT_FOUND"
    41  	MISC                                   = "MISC_ERROR"
    42  	SENTINEL_NUMBER_IN_MEMORY_MISMATCH     = "SENTINEL_NUMBER_IN_MEMORY_MISMATCH"
    43  	REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH = "REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH"
    44  	// redis connection related errors
    45  	WRONG_PASSWORD_USED = "WRONG_PASSWORD_USED"
    46  	NOAUTH              = "AUTH_CREDENTIALS_NOT_PROVIDED"
    47  	NOPERM              = "REDIS_USER_DOES_NOT_HAVE_PERMISSIONS"
    48  	IO_TIMEOUT          = "CONNECTION_TIMEDOUT"
    49  	CONNECTION_REFUSED  = "CONNECTION_REFUSED"
    50  
    51  	K8S_FORBIDDEN_ERR = "USER_FORBIDDEN_TO_PERFORM_ACTION"
    52  	K8S_UNAUTH        = "CLIENT_NOT_AUTHORISED"
    53  	K8S_MISC          = "MISC_ERROR_CHECK_LOGS"
    54  	K8S_NOT_FOUND     = "RESOURCE_NOT_FOUND"
    55  
    56  	KIND_REDIS                  = "REDIS"
    57  	KIND_SENTINEL               = "SENTINEL"
    58  	APPLY_REDIS_CONFIG          = "APPLY_REDIS_CONFIG"
    59  	APPLY_EXTERNAL_MASTER       = "APPLY_EXT_MASTER_ALL"
    60  	APPLY_SENTINEL_CONFIG       = "APPLY_SENTINEL_CONFIG"
    61  	MONITOR_REDIS_WITH_PORT     = "SET_SENTINEL_TO_MONITOR_REDIS_WITH_GIVEN_PORT"
    62  	RESET_SENTINEL              = "RESET_ALL_SENTINEL_CONFIG"
    63  	GET_NUM_SENTINELS_IN_MEM    = "GET_NUMBER_OF_SENTINELS_IN_MEMORY"    // `info sentinel` command on a sentinel machine > grep sentinel
    64  	GET_NUM_REDIS_SLAVES_IN_MEM = "GET_NUMBER_OF_REDIS_SLAVES_IN_MEMORY" // `info sentinel` command on a sentinel machine > grep slaves
    65  	GET_SLAVE_OF                = "GET_MASTER_OF_GIVEN_SLAVE_INSTANCE"
    66  	IS_MASTER                   = "CHECK_IF_INSTANCE_IS_MASTER"
    67  	MAKE_MASTER                 = "MAKE_INSTANCE_AS_MASTER"
    68  	MAKE_SLAVE_OF               = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE"
    69  	GET_SENTINEL_MONITOR        = "SENTINEL_GET_MASTER_INSTANCE"
    70  	CHECK_SENTINEL_QUORUM       = "SENTINEL_CKQUORUM"
    71  	SLAVE_IS_READY              = "CHECK_IF_SLAVE_IS_READY"
    72  )
    73  
    74  var ( // used for grabage collection of metrics
    75  	mutex                     sync.Mutex
    76  	recorders                 = []recorder{}
    77  	instanceMetricLastUpdated = map[string]time.Time{}
    78  	resourceMetricLastUpdated = map[string]time.Time{}
    79  )
    80  
    81  // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics.
    82  type Recorder interface {
    83  	koopercontroller.MetricsRecorder
    84  
    85  	// ClusterOK metrics
    86  	SetClusterOK(namespace string, name string)
    87  	SetClusterError(namespace string, name string)
    88  	DeleteCluster(namespace string, name string)
    89  
    90  	// Indicate redis instances being monitored
    91  	RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string)
    92  
    93  	RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string)
    94  	RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string)
    95  
    96  	RecordK8sOperation(namespace string, kind string, name string, operation string, status string, err string)
    97  	RecordRedisOperation(kind string, IP string, operation string, status string, err string)
    98  }
    99  
   100  // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus.
   101  type recorder struct {
   102  	// Metrics fields.
   103  	clusterOK            *prometheus.GaugeVec   // clusterOk is the status of a cluster
   104  	ensureResource       *prometheus.CounterVec // number of successful "ensure" operators performed by the controller.
   105  	redisCheck           *prometheus.CounterVec // indicates any error encountered in managed redis instance(s)
   106  	sentinelCheck        *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s)
   107  	k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s
   108  	redisOperations      *prometheus.CounterVec // number of operations performed on redis/sentinel instances
   109  	koopercontroller.MetricsRecorder
   110  }
   111  
   112  // NewPrometheusMetrics returns a new PromMetrics object.
   113  func NewRecorder(namespace string, reg prometheus.Registerer) Recorder {
   114  	// Create metrics.
   115  	clusterOK := prometheus.NewGaugeVec(prometheus.GaugeOpts{
   116  		Namespace: namespace,
   117  		Subsystem: promControllerSubsystem,
   118  		Name:      "cluster_ok",
   119  		Help:      "Number of failover clusters managed by the operator.",
   120  	}, []string{"namespace", "name"})
   121  
   122  	ensureResource := prometheus.NewCounterVec(prometheus.CounterOpts{
   123  		Namespace: namespace,
   124  		Subsystem: promControllerSubsystem,
   125  		Name:      "ensure_resource_total",
   126  		Help:      "number of 'ensure' operations on a resource performed by the controller.",
   127  	}, []string{"namespace", "name", "kind", "resource_name", "status"})
   128  
   129  	redisCheck := prometheus.NewCounterVec(prometheus.CounterOpts{
   130  		Namespace: namespace,
   131  		Subsystem: promControllerSubsystem,
   132  		Name:      "redis_checks_total",
   133  		Help:      "indicates any error encountered in managed redis instance(s)",
   134  	}, []string{"namespace", "resource", "indicator", "instance", "status"})
   135  
   136  	sentinelCheck := prometheus.NewCounterVec(prometheus.CounterOpts{
   137  		Namespace: namespace,
   138  		Subsystem: promControllerSubsystem,
   139  		Name:      "sentinel_checks_total",
   140  		Help:      "indicates any error encountered in managed sentinel instance(s)",
   141  	}, []string{"namespace", "resource", "indicator", "instance", "status"})
   142  
   143  	redisOperations := prometheus.NewCounterVec(
   144  		prometheus.CounterOpts{
   145  			Namespace: namespace,
   146  			Subsystem: promControllerSubsystem,
   147  			Name:      "redis_operations_total",
   148  			Help:      "number of operations performed on redis",
   149  		}, []string{"kind" /* redis/sentinel? */, "IP", "operation", "status", "err"})
   150  
   151  	k8sServiceOperations := prometheus.NewCounterVec(
   152  		prometheus.CounterOpts{
   153  			Namespace: namespace,
   154  			Subsystem: promControllerSubsystem,
   155  			Name:      "k8s_operations_total",
   156  			Help:      "number of operations performed on k8s",
   157  		}, []string{"namespace", "kind", "name", "operation", "status", "err"})
   158  
   159  	// Create the instance.
   160  	r := recorder{
   161  		clusterOK:            clusterOK,
   162  		ensureResource:       ensureResource,
   163  		redisCheck:           redisCheck,
   164  		sentinelCheck:        sentinelCheck,
   165  		k8sServiceOperations: k8sServiceOperations,
   166  		redisOperations:      redisOperations,
   167  		MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{
   168  			Registerer: reg,
   169  		}),
   170  	}
   171  
   172  	// Register metrics.
   173  	reg.MustRegister(
   174  		r.clusterOK,
   175  		r.ensureResource,
   176  		r.redisCheck,
   177  		r.sentinelCheck,
   178  		r.k8sServiceOperations,
   179  		r.redisOperations,
   180  	)
   181  	recorders = append(recorders, r)
   182  	return r
   183  }
   184  
   185  // SetClusterOK set the cluster status to OK
   186  func (r recorder) SetClusterOK(namespace string, name string) {
   187  	r.clusterOK.WithLabelValues(namespace, name).Set(1)
   188  }
   189  
   190  // SetClusterError set the cluster status to Error
   191  func (r recorder) SetClusterError(namespace string, name string) {
   192  	r.clusterOK.WithLabelValues(namespace, name).Set(0)
   193  }
   194  
   195  // DeleteCluster set the cluster status to Error
   196  func (r recorder) DeleteCluster(namespace string, name string) {
   197  	r.clusterOK.DeleteLabelValues(namespace, name)
   198  }
   199  
   200  func (r recorder) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) {
   201  	r.ensureResource.WithLabelValues(objectNamespace, objectName, objectKind, resourceName, status).Add(1)
   202  	updateResourceMetricLastUpdatedTracker(objectNamespace, objectKind, objectName)
   203  }
   204  
   205  func (r recorder) RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) {
   206  	r.redisCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1)
   207  	updateResourceMetricLastUpdatedTracker(namespace, "redisfailover", resource)
   208  }
   209  
   210  func (r recorder) RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string) {
   211  	r.sentinelCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1)
   212  	updateResourceMetricLastUpdatedTracker(namespace, "redisfailover", resource)
   213  }
   214  
   215  func (r recorder) RecordK8sOperation(namespace string, kind string, name string, operation string, status string, err string) {
   216  	r.k8sServiceOperations.WithLabelValues(namespace, kind, name, operation, status, err).Add(1)
   217  	updateResourceMetricLastUpdatedTracker(namespace, kind, name)
   218  }
   219  
   220  func (r recorder) RecordRedisOperation(kind /*redis/sentinel? */ string, IP string, operation string, status string, err string) {
   221  	r.redisOperations.WithLabelValues(kind, IP, operation, status, err).Add(1)
   222  	updateInstanceMetricLastUpdatedTracker(IP)
   223  }
   224  
   225  func updateResourceMetricLastUpdatedTracker(namespace string, kind string, name string) {
   226  	mutex.Lock()
   227  	resourceMetricLastUpdated[fmt.Sprintf("%v/%v/%v", namespace, kind, name)] = time.Now()
   228  	mutex.Unlock()
   229  }
   230  
   231  func updateInstanceMetricLastUpdatedTracker(IP string) {
   232  	mutex.Lock()
   233  	instanceMetricLastUpdated[IP] = time.Now()
   234  	mutex.Unlock()
   235  }
   236  
   237  // Garbage collection
   238  func removeStaleMetrics() {
   239  	// Runs every `metricsGCIntervalMinutes`. It keeps track of recently updated metrics
   240  	// And every metric that was not updated after `metricsGCIntervalMinutes` gets deleted
   241  	for {
   242  		metricsDeletedCount := 0
   243  		kubernetesResourceBasedLabels, customResourceBasedLabels, ipBasedLabels := getLabelsOfStaleMetrics()
   244  		for _, recorder := range recorders {
   245  			for _, label := range kubernetesResourceBasedLabels {
   246  				metricsDeletedCount += recorder.ensureResource.DeletePartialMatch(label)
   247  				metricsDeletedCount += recorder.k8sServiceOperations.DeletePartialMatch(label)
   248  			}
   249  			for _, label := range customResourceBasedLabels {
   250  				metricsDeletedCount += recorder.redisCheck.DeletePartialMatch(label)
   251  				metricsDeletedCount += recorder.sentinelCheck.DeletePartialMatch(label)
   252  				labelWithName := label
   253  				labelWithName["name"] = labelWithName["resource"]
   254  				delete(labelWithName, "resource")
   255  				metricsDeletedCount += recorder.clusterOK.DeletePartialMatch(label)
   256  			}
   257  			for _, label := range ipBasedLabels {
   258  				metricsDeletedCount += recorder.redisOperations.DeletePartialMatch(label)
   259  			}
   260  		}
   261  		log.Debugf("delete %v stale metrics", metricsDeletedCount)
   262  		time.Sleep(metricsGCIntervalMinutes * time.Minute)
   263  	}
   264  }
   265  
   266  func getLabelsOfStaleMetrics() (kubernetesResourceBasedLabels []prometheus.Labels, customResourceBasedLabels []prometheus.Labels, ipBasedLabels []prometheus.Labels) {
   267  
   268  	kubernetesResourceBasedLabels = []prometheus.Labels{}
   269  	customResourceBasedLabels = []prometheus.Labels{}
   270  	ipBasedLabels = []prometheus.Labels{}
   271  
   272  	for key, value := range resourceMetricLastUpdated {
   273  		// if the key is stale
   274  		if value.Before(time.Now().Add(-metricsGCIntervalMinutes * time.Minute)) {
   275  			// extract keys and create labels
   276  			ids := strings.Split(key, "/")
   277  			namespace := ids[0]
   278  			kind := ids[1]
   279  			resource := ids[2]
   280  			kubernetesResourceBasedLabels = append(kubernetesResourceBasedLabels,
   281  				prometheus.Labels{
   282  					"namespace": namespace,
   283  					"name":      resource,
   284  					"kind":      kind,
   285  				},
   286  			)
   287  			customResourceBasedLabels = append(customResourceBasedLabels,
   288  				prometheus.Labels{
   289  					"namespace": namespace,
   290  					"resource":  resource,
   291  				},
   292  			)
   293  			// once we have created labels out of the contents of the key,
   294  			// its not longer required - since it is known to be stale. remove it from the tracker.
   295  			mutex.Lock()
   296  			delete(resourceMetricLastUpdated, key)
   297  			mutex.Unlock()
   298  		}
   299  	}
   300  	for IP, value := range instanceMetricLastUpdated {
   301  		if value.Before(time.Now().Add(-metricsGCIntervalMinutes * time.Minute)) {
   302  			ipBasedLabels = append(ipBasedLabels,
   303  				prometheus.Labels{
   304  					"IP": IP,
   305  				},
   306  			)
   307  			// once we have created labels out of the contents of the key,
   308  			// its not longer required - since it is known to be stale. remove it from the tracker.
   309  			mutex.Lock()
   310  			delete(instanceMetricLastUpdated, IP)
   311  			mutex.Unlock()
   312  		}
   313  
   314  	}
   315  	return kubernetesResourceBasedLabels, customResourceBasedLabels, ipBasedLabels
   316  }