github.com/spotahome/redis-operator@v1.2.4/metrics/metrics.go (about) 1 package metrics 2 3 import ( 4 "fmt" 5 "strings" 6 "sync" 7 "time" 8 9 "github.com/prometheus/client_golang/prometheus" 10 koopercontroller "github.com/spotahome/kooper/v2/controller" 11 kooperprometheus "github.com/spotahome/kooper/v2/metrics/prometheus" 12 "github.com/spotahome/redis-operator/log" 13 ) 14 15 const ( 16 promControllerSubsystem = "controller" 17 metricsGCIntervalMinutes = 5 18 ) 19 20 func init() { 21 go removeStaleMetrics() 22 } 23 24 // variables for setting various indicator labels 25 const ( 26 SUCCESS = "SUCCESS" 27 FAIL = "FAIL" 28 STATUS_HEALTHY = "HEALTHY" 29 STATUS_UNHEALTHY = "UNHEALTHY" 30 NOT_APPLICABLE = "NA" 31 UNHEALTHY = 1.0 32 HEALTHY = 0.0 33 REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" 34 SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" 35 NO_MASTER = "NO_MASTER_AVAILABLE" 36 NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" 37 SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" 38 SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" 39 SENTINEL_NOT_READY = "SENTINEL_NOT_READY" 40 REGEX_NOT_FOUND = "SENTINEL_REGEX_NOT_FOUND" 41 MISC = "MISC_ERROR" 42 SENTINEL_NUMBER_IN_MEMORY_MISMATCH = "SENTINEL_NUMBER_IN_MEMORY_MISMATCH" 43 REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH = "REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH" 44 // redis connection related errors 45 WRONG_PASSWORD_USED = "WRONG_PASSWORD_USED" 46 NOAUTH = "AUTH_CREDENTIALS_NOT_PROVIDED" 47 NOPERM = "REDIS_USER_DOES_NOT_HAVE_PERMISSIONS" 48 IO_TIMEOUT = "CONNECTION_TIMEDOUT" 49 CONNECTION_REFUSED = "CONNECTION_REFUSED" 50 51 K8S_FORBIDDEN_ERR = "USER_FORBIDDEN_TO_PERFORM_ACTION" 52 K8S_UNAUTH = "CLIENT_NOT_AUTHORISED" 53 K8S_MISC = "MISC_ERROR_CHECK_LOGS" 54 K8S_NOT_FOUND = "RESOURCE_NOT_FOUND" 55 56 KIND_REDIS = "REDIS" 57 KIND_SENTINEL = "SENTINEL" 58 APPLY_REDIS_CONFIG = "APPLY_REDIS_CONFIG" 59 APPLY_EXTERNAL_MASTER = "APPLY_EXT_MASTER_ALL" 60 APPLY_SENTINEL_CONFIG = "APPLY_SENTINEL_CONFIG" 61 MONITOR_REDIS_WITH_PORT = "SET_SENTINEL_TO_MONITOR_REDIS_WITH_GIVEN_PORT" 62 RESET_SENTINEL = "RESET_ALL_SENTINEL_CONFIG" 63 GET_NUM_SENTINELS_IN_MEM = "GET_NUMBER_OF_SENTINELS_IN_MEMORY" // `info sentinel` command on a sentinel machine > grep sentinel 64 GET_NUM_REDIS_SLAVES_IN_MEM = "GET_NUMBER_OF_REDIS_SLAVES_IN_MEMORY" // `info sentinel` command on a sentinel machine > grep slaves 65 GET_SLAVE_OF = "GET_MASTER_OF_GIVEN_SLAVE_INSTANCE" 66 IS_MASTER = "CHECK_IF_INSTANCE_IS_MASTER" 67 MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER" 68 MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE" 69 GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE" 70 CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM" 71 SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" 72 ) 73 74 var ( // used for grabage collection of metrics 75 mutex sync.Mutex 76 recorders = []recorder{} 77 instanceMetricLastUpdated = map[string]time.Time{} 78 resourceMetricLastUpdated = map[string]time.Time{} 79 ) 80 81 // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics. 82 type Recorder interface { 83 koopercontroller.MetricsRecorder 84 85 // ClusterOK metrics 86 SetClusterOK(namespace string, name string) 87 SetClusterError(namespace string, name string) 88 DeleteCluster(namespace string, name string) 89 90 // Indicate redis instances being monitored 91 RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) 92 93 RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) 94 RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string) 95 96 RecordK8sOperation(namespace string, kind string, name string, operation string, status string, err string) 97 RecordRedisOperation(kind string, IP string, operation string, status string, err string) 98 } 99 100 // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. 101 type recorder struct { 102 // Metrics fields. 103 clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster 104 ensureResource *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. 105 redisCheck *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) 106 sentinelCheck *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) 107 k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s 108 redisOperations *prometheus.CounterVec // number of operations performed on redis/sentinel instances 109 koopercontroller.MetricsRecorder 110 } 111 112 // NewPrometheusMetrics returns a new PromMetrics object. 113 func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { 114 // Create metrics. 115 clusterOK := prometheus.NewGaugeVec(prometheus.GaugeOpts{ 116 Namespace: namespace, 117 Subsystem: promControllerSubsystem, 118 Name: "cluster_ok", 119 Help: "Number of failover clusters managed by the operator.", 120 }, []string{"namespace", "name"}) 121 122 ensureResource := prometheus.NewCounterVec(prometheus.CounterOpts{ 123 Namespace: namespace, 124 Subsystem: promControllerSubsystem, 125 Name: "ensure_resource_total", 126 Help: "number of 'ensure' operations on a resource performed by the controller.", 127 }, []string{"namespace", "name", "kind", "resource_name", "status"}) 128 129 redisCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ 130 Namespace: namespace, 131 Subsystem: promControllerSubsystem, 132 Name: "redis_checks_total", 133 Help: "indicates any error encountered in managed redis instance(s)", 134 }, []string{"namespace", "resource", "indicator", "instance", "status"}) 135 136 sentinelCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ 137 Namespace: namespace, 138 Subsystem: promControllerSubsystem, 139 Name: "sentinel_checks_total", 140 Help: "indicates any error encountered in managed sentinel instance(s)", 141 }, []string{"namespace", "resource", "indicator", "instance", "status"}) 142 143 redisOperations := prometheus.NewCounterVec( 144 prometheus.CounterOpts{ 145 Namespace: namespace, 146 Subsystem: promControllerSubsystem, 147 Name: "redis_operations_total", 148 Help: "number of operations performed on redis", 149 }, []string{"kind" /* redis/sentinel? */, "IP", "operation", "status", "err"}) 150 151 k8sServiceOperations := prometheus.NewCounterVec( 152 prometheus.CounterOpts{ 153 Namespace: namespace, 154 Subsystem: promControllerSubsystem, 155 Name: "k8s_operations_total", 156 Help: "number of operations performed on k8s", 157 }, []string{"namespace", "kind", "name", "operation", "status", "err"}) 158 159 // Create the instance. 160 r := recorder{ 161 clusterOK: clusterOK, 162 ensureResource: ensureResource, 163 redisCheck: redisCheck, 164 sentinelCheck: sentinelCheck, 165 k8sServiceOperations: k8sServiceOperations, 166 redisOperations: redisOperations, 167 MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ 168 Registerer: reg, 169 }), 170 } 171 172 // Register metrics. 173 reg.MustRegister( 174 r.clusterOK, 175 r.ensureResource, 176 r.redisCheck, 177 r.sentinelCheck, 178 r.k8sServiceOperations, 179 r.redisOperations, 180 ) 181 recorders = append(recorders, r) 182 return r 183 } 184 185 // SetClusterOK set the cluster status to OK 186 func (r recorder) SetClusterOK(namespace string, name string) { 187 r.clusterOK.WithLabelValues(namespace, name).Set(1) 188 } 189 190 // SetClusterError set the cluster status to Error 191 func (r recorder) SetClusterError(namespace string, name string) { 192 r.clusterOK.WithLabelValues(namespace, name).Set(0) 193 } 194 195 // DeleteCluster set the cluster status to Error 196 func (r recorder) DeleteCluster(namespace string, name string) { 197 r.clusterOK.DeleteLabelValues(namespace, name) 198 } 199 200 func (r recorder) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) { 201 r.ensureResource.WithLabelValues(objectNamespace, objectName, objectKind, resourceName, status).Add(1) 202 updateResourceMetricLastUpdatedTracker(objectNamespace, objectKind, objectName) 203 } 204 205 func (r recorder) RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) { 206 r.redisCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1) 207 updateResourceMetricLastUpdatedTracker(namespace, "redisfailover", resource) 208 } 209 210 func (r recorder) RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string) { 211 r.sentinelCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1) 212 updateResourceMetricLastUpdatedTracker(namespace, "redisfailover", resource) 213 } 214 215 func (r recorder) RecordK8sOperation(namespace string, kind string, name string, operation string, status string, err string) { 216 r.k8sServiceOperations.WithLabelValues(namespace, kind, name, operation, status, err).Add(1) 217 updateResourceMetricLastUpdatedTracker(namespace, kind, name) 218 } 219 220 func (r recorder) RecordRedisOperation(kind /*redis/sentinel? */ string, IP string, operation string, status string, err string) { 221 r.redisOperations.WithLabelValues(kind, IP, operation, status, err).Add(1) 222 updateInstanceMetricLastUpdatedTracker(IP) 223 } 224 225 func updateResourceMetricLastUpdatedTracker(namespace string, kind string, name string) { 226 mutex.Lock() 227 resourceMetricLastUpdated[fmt.Sprintf("%v/%v/%v", namespace, kind, name)] = time.Now() 228 mutex.Unlock() 229 } 230 231 func updateInstanceMetricLastUpdatedTracker(IP string) { 232 mutex.Lock() 233 instanceMetricLastUpdated[IP] = time.Now() 234 mutex.Unlock() 235 } 236 237 // Garbage collection 238 func removeStaleMetrics() { 239 // Runs every `metricsGCIntervalMinutes`. It keeps track of recently updated metrics 240 // And every metric that was not updated after `metricsGCIntervalMinutes` gets deleted 241 for { 242 metricsDeletedCount := 0 243 kubernetesResourceBasedLabels, customResourceBasedLabels, ipBasedLabels := getLabelsOfStaleMetrics() 244 for _, recorder := range recorders { 245 for _, label := range kubernetesResourceBasedLabels { 246 metricsDeletedCount += recorder.ensureResource.DeletePartialMatch(label) 247 metricsDeletedCount += recorder.k8sServiceOperations.DeletePartialMatch(label) 248 } 249 for _, label := range customResourceBasedLabels { 250 metricsDeletedCount += recorder.redisCheck.DeletePartialMatch(label) 251 metricsDeletedCount += recorder.sentinelCheck.DeletePartialMatch(label) 252 labelWithName := label 253 labelWithName["name"] = labelWithName["resource"] 254 delete(labelWithName, "resource") 255 metricsDeletedCount += recorder.clusterOK.DeletePartialMatch(label) 256 } 257 for _, label := range ipBasedLabels { 258 metricsDeletedCount += recorder.redisOperations.DeletePartialMatch(label) 259 } 260 } 261 log.Debugf("delete %v stale metrics", metricsDeletedCount) 262 time.Sleep(metricsGCIntervalMinutes * time.Minute) 263 } 264 } 265 266 func getLabelsOfStaleMetrics() (kubernetesResourceBasedLabels []prometheus.Labels, customResourceBasedLabels []prometheus.Labels, ipBasedLabels []prometheus.Labels) { 267 268 kubernetesResourceBasedLabels = []prometheus.Labels{} 269 customResourceBasedLabels = []prometheus.Labels{} 270 ipBasedLabels = []prometheus.Labels{} 271 272 for key, value := range resourceMetricLastUpdated { 273 // if the key is stale 274 if value.Before(time.Now().Add(-metricsGCIntervalMinutes * time.Minute)) { 275 // extract keys and create labels 276 ids := strings.Split(key, "/") 277 namespace := ids[0] 278 kind := ids[1] 279 resource := ids[2] 280 kubernetesResourceBasedLabels = append(kubernetesResourceBasedLabels, 281 prometheus.Labels{ 282 "namespace": namespace, 283 "name": resource, 284 "kind": kind, 285 }, 286 ) 287 customResourceBasedLabels = append(customResourceBasedLabels, 288 prometheus.Labels{ 289 "namespace": namespace, 290 "resource": resource, 291 }, 292 ) 293 // once we have created labels out of the contents of the key, 294 // its not longer required - since it is known to be stale. remove it from the tracker. 295 mutex.Lock() 296 delete(resourceMetricLastUpdated, key) 297 mutex.Unlock() 298 } 299 } 300 for IP, value := range instanceMetricLastUpdated { 301 if value.Before(time.Now().Add(-metricsGCIntervalMinutes * time.Minute)) { 302 ipBasedLabels = append(ipBasedLabels, 303 prometheus.Labels{ 304 "IP": IP, 305 }, 306 ) 307 // once we have created labels out of the contents of the key, 308 // its not longer required - since it is known to be stale. remove it from the tracker. 309 mutex.Lock() 310 delete(instanceMetricLastUpdated, IP) 311 mutex.Unlock() 312 } 313 314 } 315 return kubernetesResourceBasedLabels, customResourceBasedLabels, ipBasedLabels 316 }