github.com/spotahome/redis-operator@v1.2.4/operator/redisfailover/checker.go (about) 1 package redisfailover 2 3 import ( 4 "errors" 5 "strconv" 6 "time" 7 8 redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" 9 "github.com/spotahome/redis-operator/metrics" 10 ) 11 12 // UpdateRedisesPods if the running version of pods are equal to the statefulset one 13 func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error { 14 redises, err := r.rfChecker.GetRedisesIPs(rf) 15 if err != nil { 16 return err 17 } 18 19 masterIP := "" 20 if !rf.Bootstrapping() { 21 masterIP, _ = r.rfChecker.GetMasterIP(rf) 22 } 23 // No perform updates when nodes are syncing, still not connected, etc. 24 for _, rip := range redises { 25 if rip != masterIP { 26 ready, err := r.rfChecker.CheckRedisSlavesReady(rip, rf) 27 if err != nil { 28 return err 29 } 30 if !ready { 31 return nil 32 } 33 } 34 } 35 36 ssUR, err := r.rfChecker.GetStatefulSetUpdateRevision(rf) 37 if err != nil { 38 return err 39 } 40 41 redisesPods, err := r.rfChecker.GetRedisesSlavesPods(rf) 42 if err != nil { 43 return err 44 } 45 46 // Update stale pods with slave role 47 for _, pod := range redisesPods { 48 revision, err := r.rfChecker.GetRedisRevisionHash(pod, rf) 49 if err != nil { 50 return err 51 } 52 if revision != ssUR { 53 //Delete pod and wait next round to check if the new one is synced 54 err = r.rfHealer.DeletePod(pod, rf) 55 if err != nil { 56 return err 57 } 58 return nil 59 } 60 } 61 62 if !rf.Bootstrapping() { 63 // Update stale pod with role master 64 master, err := r.rfChecker.GetRedisesMasterPod(rf) 65 if err != nil { 66 return err 67 } 68 69 masterRevision, err := r.rfChecker.GetRedisRevisionHash(master, rf) 70 if err != nil { 71 return err 72 } 73 if masterRevision != ssUR { 74 err = r.rfHealer.DeletePod(master, rf) 75 if err != nil { 76 return err 77 } 78 return nil 79 } 80 } 81 82 return nil 83 } 84 85 // CheckAndHeal runs verifcation checks to ensure the RedisFailover is in an expected and healthy state. 86 // If the checks do not match up to expectations, an attempt will be made to "heal" the RedisFailover into a healthy state. 87 func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error { 88 if rf.Bootstrapping() { 89 return r.checkAndHealBootstrapMode(rf) 90 } 91 92 // Number of redis is equal as the set on the RF spec 93 // Number of sentinel is equal as the set on the RF spec 94 // Check only one master 95 // Number of redis master is 1 96 // All redis slaves have the same master 97 // All sentinels points to the same redis master 98 // Sentinel has not death nodes 99 // Sentinel knows the correct slave number 100 101 if !r.rfChecker.IsRedisRunning(rf) { 102 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) 103 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") 104 return nil 105 } 106 107 if !r.rfChecker.IsSentinelRunning(rf) { 108 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) 109 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") 110 return nil 111 } 112 113 nMasters, err := r.rfChecker.GetNumberMasters(rf) 114 if err != nil { 115 return err 116 } 117 118 switch nMasters { 119 case 0: 120 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected")) 121 //when number of redis replicas is 1 , the redis is configured for standalone master mode 122 //Configure to master 123 if rf.Spec.Redis.Replicas == 1 { 124 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master") 125 err = r.rfHealer.SetOldestAsMaster(rf) 126 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err) 127 if err != nil { 128 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") 129 return err 130 } 131 return nil 132 } 133 //During the First boot(New deployment or all pods of the statefulsets have restarted), 134 //Sentinesl will not be able to choose the master , so operator should select a master 135 //Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached 136 //Operator can choose a master , These scenarios can be checked by asking the all the sentinels 137 //if its in a postion to choose a master also check if the redis is configured with local host IP as master. 138 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0") 139 maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf) 140 if err != nil { 141 return err 142 } 143 144 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds()) 145 //Check If Sentinel has quorum to take a failover decision 146 noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf) 147 if err != nil { 148 // Sentinels are not in a situation to choose a master we pick one 149 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt) 150 err2 := r.rfHealer.SetOldestAsMaster(rf) 151 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2) 152 if err2 != nil { 153 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") 154 return err2 155 } 156 } else { 157 //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master 158 status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf) 159 if err2 != nil { 160 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later") 161 return err2 162 } else if status { 163 // all avaialable redis pods have local host ip as master 164 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection") 165 err3 := r.rfHealer.SetOldestAsMaster(rf) 166 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3) 167 if err3 != nil { 168 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") 169 return err3 170 } 171 172 } else { 173 174 // We'll wait until failover is done 175 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually") 176 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually")) 177 return nil 178 } 179 180 } 181 182 case 1: 183 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) 184 default: 185 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected")) 186 return errors.New("more than one master, fix manually") 187 } 188 189 master, err := r.rfChecker.GetMasterIP(rf) 190 if err != nil { 191 return err 192 } 193 194 err = r.rfChecker.CheckAllSlavesFromMaster(master, rf) 195 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err) 196 if err != nil { 197 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error()) 198 if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil { 199 return err 200 } 201 } 202 203 err = r.applyRedisCustomConfig(rf) 204 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) 205 if err != nil { 206 return err 207 } 208 209 err = r.UpdateRedisesPods(rf) 210 if err != nil { 211 return err 212 } 213 214 sentinels, err := r.rfChecker.GetSentinelsIPs(rf) 215 if err != nil { 216 return err 217 } 218 219 port := getRedisPort(rf.Spec.Redis.Port) 220 for _, sip := range sentinels { 221 err = r.rfChecker.CheckSentinelMonitor(sip, master, port) 222 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) 223 if err != nil { 224 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) 225 if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { 226 return err 227 } 228 } 229 } 230 return r.checkAndHealSentinels(rf, sentinels) 231 } 232 233 func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error { 234 235 if !r.rfChecker.IsRedisRunning(rf) { 236 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) 237 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") 238 return nil 239 } 240 241 err := r.UpdateRedisesPods(rf) 242 if err != nil { 243 return err 244 } 245 err = r.applyRedisCustomConfig(rf) 246 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) 247 if err != nil { 248 return err 249 } 250 251 bootstrapSettings := rf.Spec.BootstrapNode 252 err = r.rfHealer.SetExternalMasterOnAll(bootstrapSettings.Host, bootstrapSettings.Port, rf) 253 setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_EXTERNAL_MASTER, metrics.NOT_APPLICABLE, err) 254 if err != nil { 255 return err 256 } 257 258 if rf.SentinelsAllowed() { 259 if !r.rfChecker.IsSentinelRunning(rf) { 260 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) 261 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") 262 return nil 263 } 264 265 sentinels, err := r.rfChecker.GetSentinelsIPs(rf) 266 if err != nil { 267 return err 268 } 269 for _, sip := range sentinels { 270 err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port) 271 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) 272 if err != nil { 273 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) 274 if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil { 275 return err 276 } 277 } 278 } 279 return r.checkAndHealSentinels(rf, sentinels) 280 } 281 return nil 282 } 283 284 func (r *RedisFailoverHandler) applyRedisCustomConfig(rf *redisfailoverv1.RedisFailover) error { 285 redises, err := r.rfChecker.GetRedisesIPs(rf) 286 if err != nil { 287 return err 288 } 289 for _, rip := range redises { 290 if err := r.rfHealer.SetRedisCustomConfig(rip, rf); err != nil { 291 return err 292 } 293 } 294 return nil 295 } 296 297 func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover, sentinels []string) error { 298 for _, sip := range sentinels { 299 err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf) 300 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err) 301 if err != nil { 302 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip) 303 if err := r.rfHealer.RestoreSentinel(sip); err != nil { 304 return err 305 } 306 } 307 308 } 309 for _, sip := range sentinels { 310 err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf) 311 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err) 312 if err != nil { 313 r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip) 314 if err := r.rfHealer.RestoreSentinel(sip); err != nil { 315 return err 316 } 317 } 318 } 319 for _, sip := range sentinels { 320 err := r.rfHealer.SetSentinelCustomConfig(sip, rf) 321 setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.APPLY_SENTINEL_CONFIG, sip, err) 322 if err != nil { 323 return err 324 } 325 } 326 return nil 327 } 328 329 func getRedisPort(p int32) string { 330 return strconv.Itoa(int(p)) 331 } 332 333 func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sentinel? */ string, rfNamespace string, rfName string, property string, IP string, err error) { 334 if mode == "sentinel" { 335 if err != nil { 336 metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY) 337 } else { 338 metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY) 339 } 340 341 } else if mode == "redis" { 342 if err != nil { 343 metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY) 344 } else { 345 metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY) 346 } 347 } 348 }