github.com/spotahome/redis-operator@v1.2.4/operator/redisfailover/service/check.go (about) 1 package service 2 3 import ( 4 "errors" 5 "fmt" 6 "strconv" 7 "time" 8 9 appsv1 "k8s.io/api/apps/v1" 10 corev1 "k8s.io/api/core/v1" 11 12 redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" 13 "github.com/spotahome/redis-operator/log" 14 "github.com/spotahome/redis-operator/metrics" 15 "github.com/spotahome/redis-operator/service/k8s" 16 "github.com/spotahome/redis-operator/service/redis" 17 ) 18 19 // RedisFailoverCheck defines the interface able to check the correct status of a redis failover 20 type RedisFailoverCheck interface { 21 CheckRedisNumber(rFailover *redisfailoverv1.RedisFailover) error 22 CheckSentinelNumber(rFailover *redisfailoverv1.RedisFailover) error 23 CheckAllSlavesFromMaster(master string, rFailover *redisfailoverv1.RedisFailover) error 24 CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error 25 CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error 26 CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) 27 CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) 28 CheckSentinelMonitor(sentinel string, monitor ...string) error 29 GetMasterIP(rFailover *redisfailoverv1.RedisFailover) (string, error) 30 GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error) 31 GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) 32 GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) 33 GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) 34 GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error) 35 GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) 36 GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) 37 GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error) 38 CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error) 39 IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool 40 IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool 41 IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool 42 } 43 44 // RedisFailoverChecker is our implementation of RedisFailoverCheck interface 45 type RedisFailoverChecker struct { 46 k8sService k8s.Services 47 redisClient redis.Client 48 logger log.Logger 49 metricsClient metrics.Recorder 50 } 51 52 // NewRedisFailoverChecker creates an object of the RedisFailoverChecker struct 53 func NewRedisFailoverChecker(k8sService k8s.Services, redisClient redis.Client, logger log.Logger, metricsClient metrics.Recorder) *RedisFailoverChecker { 54 return &RedisFailoverChecker{ 55 k8sService: k8sService, 56 redisClient: redisClient, 57 logger: logger, 58 metricsClient: metricsClient, 59 } 60 } 61 62 // CheckRedisNumber controlls that the number of deployed redis is the same than the requested on the spec 63 func (r *RedisFailoverChecker) CheckRedisNumber(rf *redisfailoverv1.RedisFailover) error { 64 ss, err := r.k8sService.GetStatefulSet(rf.Namespace, GetRedisName(rf)) 65 if err != nil { 66 return err 67 } 68 if rf.Spec.Redis.Replicas != *ss.Spec.Replicas { 69 return errors.New("number of redis pods differ from specification") 70 } 71 return nil 72 } 73 74 // CheckSentinelNumber controlls that the number of deployed sentinel is the same than the requested on the spec 75 func (r *RedisFailoverChecker) CheckSentinelNumber(rf *redisfailoverv1.RedisFailover) error { 76 d, err := r.k8sService.GetDeployment(rf.Namespace, GetSentinelName(rf)) 77 if err != nil { 78 return err 79 } 80 if rf.Spec.Sentinel.Replicas != *d.Spec.Replicas { 81 return errors.New("number of sentinel pods differ from specification") 82 } 83 return nil 84 } 85 86 func (r *RedisFailoverChecker) setMasterLabelIfNecessary(namespace string, pod corev1.Pod) error { 87 for labelKey, labelValue := range pod.ObjectMeta.Labels { 88 if labelKey == redisRoleLabelKey && labelValue == redisRoleLabelMaster { 89 return nil 90 } 91 } 92 return r.k8sService.UpdatePodLabels(namespace, pod.ObjectMeta.Name, generateRedisMasterRoleLabel()) 93 } 94 95 func (r *RedisFailoverChecker) setSlaveLabelIfNecessary(namespace string, pod corev1.Pod) error { 96 for labelKey, labelValue := range pod.ObjectMeta.Labels { 97 if labelKey == redisRoleLabelKey && labelValue == redisRoleLabelSlave { 98 return nil 99 } 100 } 101 return r.k8sService.UpdatePodLabels(namespace, pod.ObjectMeta.Name, generateRedisSlaveRoleLabel()) 102 } 103 104 // CheckAllSlavesFromMaster controlls that all slaves have the same master (the real one) 105 func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redisfailoverv1.RedisFailover) error { 106 rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) 107 if err != nil { 108 return err 109 } 110 111 password, err := k8s.GetRedisPassword(r.k8sService, rf) 112 if err != nil { 113 return err 114 } 115 116 rport := getRedisPort(rf.Spec.Redis.Port) 117 for _, rp := range rps.Items { 118 if rp.Status.PodIP == master { 119 err = r.setMasterLabelIfNecessary(rf.Namespace, rp) 120 if err != nil { 121 return err 122 } 123 } else { 124 err = r.setSlaveLabelIfNecessary(rf.Namespace, rp) 125 if err != nil { 126 return err 127 } 128 } 129 130 slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password) 131 if err != nil { 132 r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP) 133 return err 134 } 135 if slave != "" && slave != master { 136 return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave) 137 } 138 } 139 return nil 140 } 141 142 // CheckSentinelNumberInMemory controls that the provided sentinel has only the living sentinels on its memory. 143 func (r *RedisFailoverChecker) CheckSentinelNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error { 144 nSentinels, err := r.redisClient.GetNumberSentinelsInMemory(sentinel) 145 if err != nil { 146 return err 147 } else if nSentinels != rf.Spec.Sentinel.Replicas { 148 return errors.New("sentinels in memory mismatch") 149 } 150 return nil 151 } 152 153 // This function will check if the local host ip is set as the master for all currently available pods 154 // This can be used to detect the fresh boot of all the redis pods 155 // This function returns true if it all available pods have local host ip as master, 156 // false if atleast one of the ip is not local hostip 157 // false and error if any function fails 158 func (r *RedisFailoverChecker) CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) { 159 160 var lhmaster int = 0 161 redisIps, err := r.GetRedisesIPs(rFailover) 162 if len(redisIps) == 0 || err != nil { 163 r.logger.Warningf("CheckIfMasterLocalhost GetRedisesIPs Failed- unable to fetch any redis Ips Currently") 164 return false, errors.New("unable to fetch any redis Ips Currently") 165 } 166 password, err := k8s.GetRedisPassword(r.k8sService, rFailover) 167 if err != nil { 168 r.logger.Errorf("CheckIfMasterLocalhost -- GetRedisPassword Failed") 169 return false, err 170 } 171 rport := getRedisPort(rFailover.Spec.Redis.Port) 172 for _, sip := range redisIps { 173 master, err := r.redisClient.GetSlaveOf(sip, rport, password) 174 if err != nil { 175 r.logger.Warningf("CheckIfMasterLocalhost -- GetSlaveOf Failed") 176 return false, err 177 } else if master == "" { 178 r.logger.Warningf("CheckIfMasterLocalhost -- Master already available ?? check manually") 179 return false, errors.New("unexpected master state, fix manually") 180 } else { 181 if master == "127.0.0.1" { 182 lhmaster++ 183 } 184 } 185 } 186 if lhmaster == len(redisIps) { 187 r.logger.Infof("all available redis configured localhost as master , opertor must heal") 188 return true, nil 189 } 190 r.logger.Infof("atleast one pod does not have localhost as master , opertor should not heal") 191 return false, nil 192 } 193 194 // This function will call the sentinel client apis to check with sentinel if the sentinel is in a state 195 // to heal the redis system 196 func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) { 197 198 var unhealthyCnt int = -1 199 200 sentinels, err := r.GetSentinelsIPs(rFailover) 201 if err != nil { 202 r.logger.Warningf("CheckSentinelQuorum Error in getting sentinel Ip's") 203 return unhealthyCnt, err 204 } 205 if len(sentinels) < int(getQuorum(rFailover)) { 206 unhealthyCnt = int(getQuorum(rFailover)) - len(sentinels) 207 r.logger.Warningf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) 208 return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") 209 } 210 211 unhealthyCnt = 0 212 for _, sip := range sentinels { 213 err = r.redisClient.SentinelCheckQuorum(sip) 214 if err != nil { 215 unhealthyCnt += 1 216 } else { 217 continue 218 } 219 } 220 if unhealthyCnt < int(getQuorum(rFailover)) { 221 return unhealthyCnt, nil 222 } else { 223 r.logger.Errorf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) 224 return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") 225 } 226 } 227 228 // CheckSentinelSlavesNumberInMemory controls that the provided sentinel has only the expected slaves number. 229 func (r *RedisFailoverChecker) CheckSentinelSlavesNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error { 230 nSlaves, err := r.redisClient.GetNumberSentinelSlavesInMemory(sentinel) 231 if err != nil { 232 return err 233 } else { 234 if rf.Bootstrapping() { 235 if nSlaves != rf.Spec.Redis.Replicas { 236 return errors.New("redis slaves in sentinel memory mismatch") 237 } 238 } else { 239 if nSlaves != rf.Spec.Redis.Replicas-1 { 240 return errors.New("redis slaves in sentinel memory mismatch") 241 } 242 } 243 } 244 return nil 245 246 } 247 248 // CheckSentinelMonitor controls if the sentinels are monitoring the expected master 249 func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ...string) error { 250 monitorIP := monitor[0] 251 monitorPort := "" 252 if len(monitor) > 1 { 253 monitorPort = monitor[1] 254 } 255 actualMonitorIP, actualMonitorPort, err := r.redisClient.GetSentinelMonitor(sentinel) 256 if err != nil { 257 return err 258 } 259 if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) { 260 return fmt.Errorf("sentinel monitoring %s:%s instead %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort) 261 } 262 return nil 263 } 264 265 // GetMasterIP connects to all redis and returns the master of the redis failover 266 func (r *RedisFailoverChecker) GetMasterIP(rf *redisfailoverv1.RedisFailover) (string, error) { 267 rips, err := r.GetRedisesIPs(rf) 268 if err != nil { 269 return "", err 270 } 271 272 password, err := k8s.GetRedisPassword(r.k8sService, rf) 273 if err != nil { 274 return "", err 275 } 276 277 masters := []string{} 278 rport := getRedisPort(rf.Spec.Redis.Port) 279 for _, rip := range rips { 280 master, err := r.redisClient.IsMaster(rip, rport, password) 281 if err != nil { 282 r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip) 283 continue 284 } 285 if master { 286 masters = append(masters, rip) 287 } 288 } 289 290 if len(masters) != 1 { 291 return "", errors.New("number of redis nodes known as master is different than 1") 292 } 293 return masters[0], nil 294 } 295 296 // GetNumberMasters returns the number of redis nodes that are working as a master 297 func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailover) (int, error) { 298 nMasters := 0 299 rips, err := r.GetRedisesIPs(rf) 300 if err != nil { 301 r.logger.Errorf(err.Error()) 302 return nMasters, err 303 } 304 305 password, err := k8s.GetRedisPassword(r.k8sService, rf) 306 if err != nil { 307 r.logger.Errorf("Error getting password: %s", err.Error()) 308 return nMasters, err 309 } 310 311 rport := getRedisPort(rf.Spec.Redis.Port) 312 for _, rip := range rips { 313 master, err := r.redisClient.IsMaster(rip, rport, password) 314 if err != nil { 315 r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip) 316 continue 317 } 318 if master { 319 nMasters++ 320 } 321 } 322 return nMasters, nil 323 } 324 325 // GetRedisesIPs returns the IPs of the Redis nodes 326 func (r *RedisFailoverChecker) GetRedisesIPs(rf *redisfailoverv1.RedisFailover) ([]string, error) { 327 redises := []string{} 328 rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) 329 if err != nil { 330 return nil, err 331 } 332 for _, rp := range rps.Items { 333 if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running pods 334 redises = append(redises, rp.Status.PodIP) 335 } 336 } 337 return redises, nil 338 } 339 340 // GetSentinelsIPs returns the IPs of the Sentinel nodes 341 func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover) ([]string, error) { 342 sentinels := []string{} 343 rps, err := r.k8sService.GetDeploymentPods(rf.Namespace, GetSentinelName(rf)) 344 if err != nil { 345 return nil, err 346 } 347 for _, sp := range rps.Items { 348 if sp.Status.Phase == corev1.PodRunning && sp.DeletionTimestamp == nil { // Only work with running pods 349 sentinels = append(sentinels, sp.Status.PodIP) 350 } 351 } 352 return sentinels, nil 353 } 354 355 // GetMaxRedisPodTime returns the MAX uptime among the active Pods 356 func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { 357 maxTime := 0 * time.Hour 358 rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) 359 if err != nil { 360 return maxTime, err 361 } 362 for _, redisNode := range rps.Items { 363 if redisNode.Status.StartTime == nil { 364 continue 365 } 366 start := redisNode.Status.StartTime.Round(time.Second) 367 alive := time.Since(start) 368 r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) 369 if alive > maxTime { 370 maxTime = alive 371 } 372 } 373 return maxTime, nil 374 } 375 376 // GetRedisesSlavesPods returns pods names of the Redis slave nodes 377 func (r *RedisFailoverChecker) GetRedisesSlavesPods(rf *redisfailoverv1.RedisFailover) ([]string, error) { 378 redises := []string{} 379 rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) 380 if err != nil { 381 return nil, err 382 } 383 384 password, err := k8s.GetRedisPassword(r.k8sService, rf) 385 if err != nil { 386 return redises, err 387 } 388 389 rport := getRedisPort(rf.Spec.Redis.Port) 390 for _, rp := range rps.Items { 391 if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running 392 master, err := r.redisClient.IsMaster(rp.Status.PodIP, rport, password) 393 if err != nil { 394 return []string{}, err 395 } 396 if !master { 397 redises = append(redises, rp.ObjectMeta.Name) 398 } 399 } 400 } 401 return redises, nil 402 } 403 404 // GetRedisesMasterPod returns pods names of the Redis slave nodes 405 func (r *RedisFailoverChecker) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) { 406 rps, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover)) 407 if err != nil { 408 return "", err 409 } 410 411 password, err := k8s.GetRedisPassword(r.k8sService, rFailover) 412 if err != nil { 413 return "", err 414 } 415 416 rport := getRedisPort(rFailover.Spec.Redis.Port) 417 for _, rp := range rps.Items { 418 if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running 419 master, err := r.redisClient.IsMaster(rp.Status.PodIP, rport, password) 420 if err != nil { 421 return "", err 422 } 423 if master { 424 return rp.ObjectMeta.Name, nil 425 } 426 } 427 } 428 return "", errors.New("redis nodes known as master not found") 429 } 430 431 // GetStatefulSetUpdateRevision returns current version for the statefulSet 432 // If the label don't exists, we return an empty value and no error, so previous versions don't break 433 func (r *RedisFailoverChecker) GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) { 434 ss, err := r.k8sService.GetStatefulSet(rFailover.Namespace, GetRedisName(rFailover)) 435 if err != nil { 436 return "", err 437 } 438 439 if ss == nil { 440 return "", errors.New("statefulSet not found") 441 } 442 443 return ss.Status.UpdateRevision, nil 444 } 445 446 // GetRedisRevisionHash returns the statefulset uid for the pod 447 func (r *RedisFailoverChecker) GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error) { 448 pod, err := r.k8sService.GetPod(rFailover.Namespace, podName) 449 if err != nil { 450 return "", err 451 } 452 453 if pod == nil { 454 return "", errors.New("pod not found") 455 } 456 457 if pod.ObjectMeta.Labels == nil { 458 return "", errors.New("labels not found") 459 } 460 461 val := pod.ObjectMeta.Labels[appsv1.ControllerRevisionHashLabelKey] 462 463 return val, nil 464 } 465 466 // CheckRedisSlavesReady returns true if the slave is ready (sync, connected, etc) 467 func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redisfailoverv1.RedisFailover) (bool, error) { 468 password, err := k8s.GetRedisPassword(r.k8sService, rFailover) 469 if err != nil { 470 return false, err 471 } 472 473 port := getRedisPort(rFailover.Spec.Redis.Port) 474 return r.redisClient.SlaveIsReady(ip, port, password) 475 } 476 477 // IsRedisRunning returns true if all the pods are Running 478 func (r *RedisFailoverChecker) IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool { 479 dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover)) 480 return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp) 481 } 482 483 // IsSentinelRunning returns true if all the pods are Running 484 func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool { 485 dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover)) 486 return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp) 487 } 488 489 // IsClusterRunning returns true if all the pods in the given redisfailover are Running 490 func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool { 491 return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover) 492 } 493 494 func getRedisPort(p int32) string { 495 return strconv.Itoa(int(p)) 496 } 497 498 func AreAllRunning(pods *corev1.PodList) bool { 499 for _, pod := range pods.Items { 500 if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil { 501 return false 502 } 503 } 504 return true 505 }