github.com/spotahome/redis-operator@v1.2.4/operator/redisfailover/checker.go (about)

     1  package redisfailover
     2  
     3  import (
     4  	"errors"
     5  	"strconv"
     6  	"time"
     7  
     8  	redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1"
     9  	"github.com/spotahome/redis-operator/metrics"
    10  )
    11  
    12  // UpdateRedisesPods if the running version of pods are equal to the statefulset one
    13  func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error {
    14  	redises, err := r.rfChecker.GetRedisesIPs(rf)
    15  	if err != nil {
    16  		return err
    17  	}
    18  
    19  	masterIP := ""
    20  	if !rf.Bootstrapping() {
    21  		masterIP, _ = r.rfChecker.GetMasterIP(rf)
    22  	}
    23  	// No perform updates when nodes are syncing, still not connected, etc.
    24  	for _, rip := range redises {
    25  		if rip != masterIP {
    26  			ready, err := r.rfChecker.CheckRedisSlavesReady(rip, rf)
    27  			if err != nil {
    28  				return err
    29  			}
    30  			if !ready {
    31  				return nil
    32  			}
    33  		}
    34  	}
    35  
    36  	ssUR, err := r.rfChecker.GetStatefulSetUpdateRevision(rf)
    37  	if err != nil {
    38  		return err
    39  	}
    40  
    41  	redisesPods, err := r.rfChecker.GetRedisesSlavesPods(rf)
    42  	if err != nil {
    43  		return err
    44  	}
    45  
    46  	// Update stale pods with slave role
    47  	for _, pod := range redisesPods {
    48  		revision, err := r.rfChecker.GetRedisRevisionHash(pod, rf)
    49  		if err != nil {
    50  			return err
    51  		}
    52  		if revision != ssUR {
    53  			//Delete pod and wait next round to check if the new one is synced
    54  			err = r.rfHealer.DeletePod(pod, rf)
    55  			if err != nil {
    56  				return err
    57  			}
    58  			return nil
    59  		}
    60  	}
    61  
    62  	if !rf.Bootstrapping() {
    63  		// Update stale pod with role master
    64  		master, err := r.rfChecker.GetRedisesMasterPod(rf)
    65  		if err != nil {
    66  			return err
    67  		}
    68  
    69  		masterRevision, err := r.rfChecker.GetRedisRevisionHash(master, rf)
    70  		if err != nil {
    71  			return err
    72  		}
    73  		if masterRevision != ssUR {
    74  			err = r.rfHealer.DeletePod(master, rf)
    75  			if err != nil {
    76  				return err
    77  			}
    78  			return nil
    79  		}
    80  	}
    81  
    82  	return nil
    83  }
    84  
    85  // CheckAndHeal runs verifcation checks to ensure the RedisFailover is in an expected and healthy state.
    86  // If the checks do not match up to expectations, an attempt will be made to "heal" the RedisFailover into a healthy state.
    87  func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error {
    88  	if rf.Bootstrapping() {
    89  		return r.checkAndHealBootstrapMode(rf)
    90  	}
    91  
    92  	// Number of redis is equal as the set on the RF spec
    93  	// Number of sentinel is equal as the set on the RF spec
    94  	// Check only one master
    95  	// Number of redis master is 1
    96  	// All redis slaves have the same master
    97  	// All sentinels points to the same redis master
    98  	// Sentinel has not death nodes
    99  	// Sentinel knows the correct slave number
   100  
   101  	if !r.rfChecker.IsRedisRunning(rf) {
   102  		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
   103  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
   104  		return nil
   105  	}
   106  
   107  	if !r.rfChecker.IsSentinelRunning(rf) {
   108  		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
   109  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
   110  		return nil
   111  	}
   112  
   113  	nMasters, err := r.rfChecker.GetNumberMasters(rf)
   114  	if err != nil {
   115  		return err
   116  	}
   117  
   118  	switch nMasters {
   119  	case 0:
   120  		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
   121  		//when number of redis replicas is 1 , the redis is configured for standalone master mode
   122  		//Configure to master
   123  		if rf.Spec.Redis.Replicas == 1 {
   124  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master")
   125  			err = r.rfHealer.SetOldestAsMaster(rf)
   126  			setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err)
   127  			if err != nil {
   128  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
   129  				return err
   130  			}
   131  			return nil
   132  		}
   133  		//During the First boot(New deployment or all pods of the statefulsets have restarted),
   134  		//Sentinesl will not be able to choose the master , so operator should select a master
   135  		//Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached
   136  		//Operator can choose a master , These scenarios can be checked by asking the all the sentinels
   137  		//if its in a postion to choose a master also check if the redis is configured with local host IP as master.
   138  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0")
   139  		maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf)
   140  		if err != nil {
   141  			return err
   142  		}
   143  
   144  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds())
   145  		//Check If Sentinel has quorum to take a failover decision
   146  		noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf)
   147  		if err != nil {
   148  			// Sentinels are not in a situation to choose a master we pick one
   149  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt)
   150  			err2 := r.rfHealer.SetOldestAsMaster(rf)
   151  			setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2)
   152  			if err2 != nil {
   153  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
   154  				return err2
   155  			}
   156  		} else {
   157  			//sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master
   158  			status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf)
   159  			if err2 != nil {
   160  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later")
   161  				return err2
   162  			} else if status {
   163  				// all avaialable redis pods have local host ip as master
   164  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection")
   165  				err3 := r.rfHealer.SetOldestAsMaster(rf)
   166  				setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3)
   167  				if err3 != nil {
   168  					r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
   169  					return err3
   170  				}
   171  
   172  			} else {
   173  
   174  				// We'll wait until failover is done
   175  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually")
   176  				setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually"))
   177  				return nil
   178  			}
   179  
   180  		}
   181  
   182  	case 1:
   183  		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil)
   184  	default:
   185  		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected"))
   186  		return errors.New("more than one master, fix manually")
   187  	}
   188  
   189  	master, err := r.rfChecker.GetMasterIP(rf)
   190  	if err != nil {
   191  		return err
   192  	}
   193  
   194  	err = r.rfChecker.CheckAllSlavesFromMaster(master, rf)
   195  	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err)
   196  	if err != nil {
   197  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error())
   198  		if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil {
   199  			return err
   200  		}
   201  	}
   202  
   203  	err = r.applyRedisCustomConfig(rf)
   204  	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err)
   205  	if err != nil {
   206  		return err
   207  	}
   208  
   209  	err = r.UpdateRedisesPods(rf)
   210  	if err != nil {
   211  		return err
   212  	}
   213  
   214  	sentinels, err := r.rfChecker.GetSentinelsIPs(rf)
   215  	if err != nil {
   216  		return err
   217  	}
   218  
   219  	port := getRedisPort(rf.Spec.Redis.Port)
   220  	for _, sip := range sentinels {
   221  		err = r.rfChecker.CheckSentinelMonitor(sip, master, port)
   222  		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
   223  		if err != nil {
   224  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
   225  			if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil {
   226  				return err
   227  			}
   228  		}
   229  	}
   230  	return r.checkAndHealSentinels(rf, sentinels)
   231  }
   232  
   233  func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error {
   234  
   235  	if !r.rfChecker.IsRedisRunning(rf) {
   236  		setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
   237  		r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile")
   238  		return nil
   239  	}
   240  
   241  	err := r.UpdateRedisesPods(rf)
   242  	if err != nil {
   243  		return err
   244  	}
   245  	err = r.applyRedisCustomConfig(rf)
   246  	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err)
   247  	if err != nil {
   248  		return err
   249  	}
   250  
   251  	bootstrapSettings := rf.Spec.BootstrapNode
   252  	err = r.rfHealer.SetExternalMasterOnAll(bootstrapSettings.Host, bootstrapSettings.Port, rf)
   253  	setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_EXTERNAL_MASTER, metrics.NOT_APPLICABLE, err)
   254  	if err != nil {
   255  		return err
   256  	}
   257  
   258  	if rf.SentinelsAllowed() {
   259  		if !r.rfChecker.IsSentinelRunning(rf) {
   260  			setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running"))
   261  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile")
   262  			return nil
   263  		}
   264  
   265  		sentinels, err := r.rfChecker.GetSentinelsIPs(rf)
   266  		if err != nil {
   267  			return err
   268  		}
   269  		for _, sip := range sentinels {
   270  			err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port)
   271  			setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err)
   272  			if err != nil {
   273  				r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error())
   274  				if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil {
   275  					return err
   276  				}
   277  			}
   278  		}
   279  		return r.checkAndHealSentinels(rf, sentinels)
   280  	}
   281  	return nil
   282  }
   283  
   284  func (r *RedisFailoverHandler) applyRedisCustomConfig(rf *redisfailoverv1.RedisFailover) error {
   285  	redises, err := r.rfChecker.GetRedisesIPs(rf)
   286  	if err != nil {
   287  		return err
   288  	}
   289  	for _, rip := range redises {
   290  		if err := r.rfHealer.SetRedisCustomConfig(rip, rf); err != nil {
   291  			return err
   292  		}
   293  	}
   294  	return nil
   295  }
   296  
   297  func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover, sentinels []string) error {
   298  	for _, sip := range sentinels {
   299  		err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf)
   300  		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err)
   301  		if err != nil {
   302  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip)
   303  			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
   304  				return err
   305  			}
   306  		}
   307  
   308  	}
   309  	for _, sip := range sentinels {
   310  		err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf)
   311  		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err)
   312  		if err != nil {
   313  			r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip)
   314  			if err := r.rfHealer.RestoreSentinel(sip); err != nil {
   315  				return err
   316  			}
   317  		}
   318  	}
   319  	for _, sip := range sentinels {
   320  		err := r.rfHealer.SetSentinelCustomConfig(sip, rf)
   321  		setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.APPLY_SENTINEL_CONFIG, sip, err)
   322  		if err != nil {
   323  			return err
   324  		}
   325  	}
   326  	return nil
   327  }
   328  
   329  func getRedisPort(p int32) string {
   330  	return strconv.Itoa(int(p))
   331  }
   332  
   333  func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sentinel? */ string, rfNamespace string, rfName string, property string, IP string, err error) {
   334  	if mode == "sentinel" {
   335  		if err != nil {
   336  			metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY)
   337  		} else {
   338  			metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY)
   339  		}
   340  
   341  	} else if mode == "redis" {
   342  		if err != nil {
   343  			metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY)
   344  		} else {
   345  			metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY)
   346  		}
   347  	}
   348  }