github.com/spotahome/redis-operator@v1.2.4/operator/redisfailover/service/check.go

package service

import (
	"errors"
	"fmt"
	"strconv"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"

	redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1"
	"github.com/spotahome/redis-operator/log"
	"github.com/spotahome/redis-operator/metrics"
	"github.com/spotahome/redis-operator/service/k8s"
	"github.com/spotahome/redis-operator/service/redis"
)

// RedisFailoverCheck defines the interface able to check the correct status of a redis failover
type RedisFailoverCheck interface {
	CheckRedisNumber(rFailover *redisfailoverv1.RedisFailover) error
	CheckSentinelNumber(rFailover *redisfailoverv1.RedisFailover) error
	CheckAllSlavesFromMaster(master string, rFailover *redisfailoverv1.RedisFailover) error
	CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error
	CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error
	CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error)
	CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error)
	CheckSentinelMonitor(sentinel string, monitor ...string) error
	GetMasterIP(rFailover *redisfailoverv1.RedisFailover) (string, error)
	GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error)
	GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error)
	GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error)
	GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error)
	GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error)
	GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error)
	GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error)
	GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error)
	CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error)
	IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool
	IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool
	IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool
}

// RedisFailoverChecker is our implementation of RedisFailoverCheck interface
type RedisFailoverChecker struct {
	k8sService    k8s.Services
	redisClient   redis.Client
	logger        log.Logger
	metricsClient metrics.Recorder
}

// NewRedisFailoverChecker creates an object of the RedisFailoverChecker struct
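// A minimal usage sketch (hypothetical wiring; in the operator these dependencies
// are built during controller setup):
//
//	checker := NewRedisFailoverChecker(k8sService, redisClient, logger, metricsRecorder)
//	if err := checker.CheckRedisNumber(rf); err != nil {
//		logger.Errorf("redis statefulset replicas out of sync: %v", err)
//	}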
func NewRedisFailoverChecker(k8sService k8s.Services, redisClient redis.Client, logger log.Logger, metricsClient metrics.Recorder) *RedisFailoverChecker {
	return &RedisFailoverChecker{
		k8sService:    k8sService,
		redisClient:   redisClient,
		logger:        logger,
		metricsClient: metricsClient,
	}
}

// CheckRedisNumber controls that the number of deployed redis pods matches the number requested in the spec
func (r *RedisFailoverChecker) CheckRedisNumber(rf *redisfailoverv1.RedisFailover) error {
	ss, err := r.k8sService.GetStatefulSet(rf.Namespace, GetRedisName(rf))
	if err != nil {
		return err
	}
	if rf.Spec.Redis.Replicas != *ss.Spec.Replicas {
		return errors.New("number of redis pods differ from specification")
	}
	return nil
}

// CheckSentinelNumber controls that the number of deployed sentinel pods matches the number requested in the spec
func (r *RedisFailoverChecker) CheckSentinelNumber(rf *redisfailoverv1.RedisFailover) error {
	d, err := r.k8sService.GetDeployment(rf.Namespace, GetSentinelName(rf))
	if err != nil {
		return err
	}
	if rf.Spec.Sentinel.Replicas != *d.Spec.Replicas {
		return errors.New("number of sentinel pods differ from specification")
	}
	return nil
}

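// setMasterLabelIfNecessary adds the redis master role label to the pod unless it already carries it.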
func (r *RedisFailoverChecker) setMasterLabelIfNecessary(namespace string, pod corev1.Pod) error {
	for labelKey, labelValue := range pod.ObjectMeta.Labels {
		if labelKey == redisRoleLabelKey && labelValue == redisRoleLabelMaster {
			return nil
		}
	}
	return r.k8sService.UpdatePodLabels(namespace, pod.ObjectMeta.Name, generateRedisMasterRoleLabel())
}

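// setSlaveLabelIfNecessary adds the redis slave role label to the pod unless it already carries it.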
func (r *RedisFailoverChecker) setSlaveLabelIfNecessary(namespace string, pod corev1.Pod) error {
	for labelKey, labelValue := range pod.ObjectMeta.Labels {
		if labelKey == redisRoleLabelKey && labelValue == redisRoleLabelSlave {
			return nil
		}
	}
	return r.k8sService.UpdatePodLabels(namespace, pod.ObjectMeta.Name, generateRedisSlaveRoleLabel())
}

// CheckAllSlavesFromMaster controls that all slaves have the same master (the real one)
func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redisfailoverv1.RedisFailover) error {
	rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf))
	if err != nil {
		return err
	}

	password, err := k8s.GetRedisPassword(r.k8sService, rf)
	if err != nil {
		return err
	}

	rport := getRedisPort(rf.Spec.Redis.Port)
	for _, rp := range rps.Items {
		if rp.Status.PodIP == master {
			err = r.setMasterLabelIfNecessary(rf.Namespace, rp)
			if err != nil {
				return err
			}
		} else {
			err = r.setSlaveLabelIfNecessary(rf.Namespace, rp)
			if err != nil {
				return err
			}
		}

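		// Regardless of the role label, verify that this pod replicates from the expected master.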
		slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password)
		if err != nil {
			r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP)
			return err
		}
		if slave != "" && slave != master {
			return fmt.Errorf("slave %s does not have the expected master %s, has %s", rp.Status.PodIP, master, slave)
		}
	}
	return nil
}

// CheckSentinelNumberInMemory controls that the provided sentinel has only the living sentinels in its memory.
func (r *RedisFailoverChecker) CheckSentinelNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error {
	nSentinels, err := r.redisClient.GetNumberSentinelsInMemory(sentinel)
	if err != nil {
		return err
	} else if nSentinels != rf.Spec.Sentinel.Replicas {
		return errors.New("sentinels in memory mismatch")
	}
	return nil
}

// CheckIfMasterLocalhost checks whether the localhost IP is set as the master on all currently available pods.
// This can be used to detect a fresh boot of all the redis pods.
// It returns true if every available pod has the localhost IP as its master,
// false if at least one of them does not,
// and false plus an error if any step fails.
func (r *RedisFailoverChecker) CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) {
	lhmaster := 0
	redisIps, err := r.GetRedisesIPs(rFailover)
	if len(redisIps) == 0 || err != nil {
		r.logger.Warningf("CheckIfMasterLocalhost GetRedisesIPs failed - unable to fetch any redis IPs currently")
		return false, errors.New("unable to fetch any redis IPs currently")
	}
	password, err := k8s.GetRedisPassword(r.k8sService, rFailover)
	if err != nil {
		r.logger.Errorf("CheckIfMasterLocalhost -- GetRedisPassword failed")
		return false, err
	}
	rport := getRedisPort(rFailover.Spec.Redis.Port)
	for _, sip := range redisIps {
		master, err := r.redisClient.GetSlaveOf(sip, rport, password)
		if err != nil {
			r.logger.Warningf("CheckIfMasterLocalhost -- GetSlaveOf failed")
			return false, err
		} else if master == "" {
			r.logger.Warningf("CheckIfMasterLocalhost -- master already available? check manually")
			return false, errors.New("unexpected master state, fix manually")
		} else if master == "127.0.0.1" {
			lhmaster++
		}
	}
	if lhmaster == len(redisIps) {
		r.logger.Infof("all available redis nodes have localhost configured as master, operator must heal")
		return true, nil
	}
	r.logger.Infof("at least one pod does not have localhost as master, operator should not heal")
	return false, nil
}

// CheckSentinelQuorum calls the sentinel client API to check whether the sentinels are in a
// state that allows them to heal the redis system. It returns the number of unhealthy
// sentinels, together with an error when quorum cannot be reached.
func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) {
	unhealthyCnt := -1

	sentinels, err := r.GetSentinelsIPs(rFailover)
	if err != nil {
		r.logger.Warningf("CheckSentinelQuorum error getting sentinel IPs")
		return unhealthyCnt, err
	}
	if len(sentinels) < int(getQuorum(rFailover)) {
		unhealthyCnt = int(getQuorum(rFailover)) - len(sentinels)
		r.logger.Warningf("insufficient sentinels to reach quorum - unhealthy count: %d", unhealthyCnt)
		return unhealthyCnt, errors.New("insufficient sentinels to reach quorum")
	}

	unhealthyCnt = 0
	for _, sip := range sentinels {
		err = r.redisClient.SentinelCheckQuorum(sip)
		if err != nil {
			unhealthyCnt++
		}
	}
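	// Fewer unhealthy sentinels than the quorum size is treated as "quorum still reachable"
	// by the remaining healthy sentinels.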
	if unhealthyCnt < int(getQuorum(rFailover)) {
		return unhealthyCnt, nil
	}
	r.logger.Errorf("insufficient sentinels to reach quorum - unhealthy count: %d", unhealthyCnt)
	return unhealthyCnt, errors.New("insufficient sentinels to reach quorum")
}

// CheckSentinelSlavesNumberInMemory controls that the provided sentinel has only the expected number of slaves in its memory.
func (r *RedisFailoverChecker) CheckSentinelSlavesNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error {
	nSlaves, err := r.redisClient.GetNumberSentinelSlavesInMemory(sentinel)
	if err != nil {
		return err
	}
	if rf.Bootstrapping() {
		if nSlaves != rf.Spec.Redis.Replicas {
			return errors.New("redis slaves in sentinel memory mismatch")
		}
	} else if nSlaves != rf.Spec.Redis.Replicas-1 {
		return errors.New("redis slaves in sentinel memory mismatch")
	}
	return nil
}

// CheckSentinelMonitor controls if the sentinels are monitoring the expected master
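// The variadic monitor argument is the expected master IP, optionally followed by the expected port.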
func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ...string) error {
	monitorIP := monitor[0]
	monitorPort := ""
	if len(monitor) > 1 {
		monitorPort = monitor[1]
	}
	actualMonitorIP, actualMonitorPort, err := r.redisClient.GetSentinelMonitor(sentinel)
	if err != nil {
		return err
	}
	if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) {
		return fmt.Errorf("sentinel monitoring %s:%s instead of %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort)
	}
	return nil
}

// GetMasterIP connects to all redis and returns the master of the redis failover
func (r *RedisFailoverChecker) GetMasterIP(rf *redisfailoverv1.RedisFailover) (string, error) {
	rips, err := r.GetRedisesIPs(rf)
	if err != nil {
		return "", err
	}

	password, err := k8s.GetRedisPassword(r.k8sService, rf)
	if err != nil {
		return "", err
	}

	masters := []string{}
	rport := getRedisPort(rf.Spec.Redis.Port)
	for _, rip := range rips {
		master, err := r.redisClient.IsMaster(rip, rport, password)
		if err != nil {
			r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip)
			continue
		}
		if master {
			masters = append(masters, rip)
		}
	}

	if len(masters) != 1 {
		return "", errors.New("number of redis nodes known as master is different than 1")
	}
	return masters[0], nil
}

// GetNumberMasters returns the number of redis nodes that are working as a master
func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailover) (int, error) {
	nMasters := 0
	rips, err := r.GetRedisesIPs(rf)
	if err != nil {
		r.logger.Errorf(err.Error())
		return nMasters, err
	}

	password, err := k8s.GetRedisPassword(r.k8sService, rf)
	if err != nil {
		r.logger.Errorf("Error getting password: %s", err.Error())
		return nMasters, err
	}

	rport := getRedisPort(rf.Spec.Redis.Port)
	for _, rip := range rips {
		master, err := r.redisClient.IsMaster(rip, rport, password)
		if err != nil {
			r.logger.Errorf("Get redis info failed, maybe this node is not ready, pod ip: %s", rip)
			continue
		}
		if master {
			nMasters++
		}
	}
	return nMasters, nil
}

// GetRedisesIPs returns the IPs of the Redis nodes
func (r *RedisFailoverChecker) GetRedisesIPs(rf *redisfailoverv1.RedisFailover) ([]string, error) {
	redises := []string{}
	rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf))
	if err != nil {
		return nil, err
	}
	for _, rp := range rps.Items {
		if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running pods
			redises = append(redises, rp.Status.PodIP)
		}
	}
	return redises, nil
}

// GetSentinelsIPs returns the IPs of the Sentinel nodes
func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover) ([]string, error) {
	sentinels := []string{}
	rps, err := r.k8sService.GetDeploymentPods(rf.Namespace, GetSentinelName(rf))
	if err != nil {
		return nil, err
	}
	for _, sp := range rps.Items {
		if sp.Status.Phase == corev1.PodRunning && sp.DeletionTimestamp == nil { // Only work with running pods
			sentinels = append(sentinels, sp.Status.PodIP)
		}
	}
	return sentinels, nil
}

// GetMaxRedisPodTime returns the MAX uptime among the active Pods
func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) {
	maxTime := 0 * time.Hour
	rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf))
	if err != nil {
		return maxTime, err
	}
	for _, redisNode := range rps.Items {
		if redisNode.Status.StartTime == nil {
			continue
		}
		start := redisNode.Status.StartTime.Round(time.Second)
		alive := time.Since(start)
		r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds())
		if alive > maxTime {
			maxTime = alive
		}
	}
	return maxTime, nil
}

// GetRedisesSlavesPods returns the pod names of the Redis slave nodes
func (r *RedisFailoverChecker) GetRedisesSlavesPods(rf *redisfailoverv1.RedisFailover) ([]string, error) {
	redises := []string{}
	rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf))
	if err != nil {
		return nil, err
	}

	password, err := k8s.GetRedisPassword(r.k8sService, rf)
	if err != nil {
		return redises, err
	}

	rport := getRedisPort(rf.Spec.Redis.Port)
	for _, rp := range rps.Items {
		if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running pods
			master, err := r.redisClient.IsMaster(rp.Status.PodIP, rport, password)
			if err != nil {
				return []string{}, err
			}
			if !master {
				redises = append(redises, rp.ObjectMeta.Name)
			}
		}
	}
	return redises, nil
}

// GetRedisesMasterPod returns the pod name of the Redis master node
func (r *RedisFailoverChecker) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) {
	rps, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover))
	if err != nil {
		return "", err
	}

	password, err := k8s.GetRedisPassword(r.k8sService, rFailover)
	if err != nil {
		return "", err
	}

	rport := getRedisPort(rFailover.Spec.Redis.Port)
	for _, rp := range rps.Items {
		if rp.Status.Phase == corev1.PodRunning && rp.DeletionTimestamp == nil { // Only work with running pods
			master, err := r.redisClient.IsMaster(rp.Status.PodIP, rport, password)
			if err != nil {
				return "", err
			}
			if master {
				return rp.ObjectMeta.Name, nil
			}
		}
	}
	return "", errors.New("redis nodes known as master not found")
}

// GetStatefulSetUpdateRevision returns the current update revision of the statefulset.
// If the revision is not available, an empty value and no error are returned, so previous versions don't break.
func (r *RedisFailoverChecker) GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) {
	ss, err := r.k8sService.GetStatefulSet(rFailover.Namespace, GetRedisName(rFailover))
	if err != nil {
		return "", err
	}

	if ss == nil {
		return "", errors.New("statefulSet not found")
	}

	return ss.Status.UpdateRevision, nil
}

// GetRedisRevisionHash returns the value of the controller-revision-hash label of the pod.
// If the label is missing, an empty value and no error are returned.
func (r *RedisFailoverChecker) GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error) {
	pod, err := r.k8sService.GetPod(rFailover.Namespace, podName)
	if err != nil {
		return "", err
	}

	if pod == nil {
		return "", errors.New("pod not found")
	}

	if pod.ObjectMeta.Labels == nil {
		return "", errors.New("labels not found")
	}

	val := pod.ObjectMeta.Labels[appsv1.ControllerRevisionHashLabelKey]

	return val, nil
}

// CheckRedisSlavesReady returns true if the slave is ready (sync, connected, etc)
func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redisfailoverv1.RedisFailover) (bool, error) {
	password, err := k8s.GetRedisPassword(r.k8sService, rFailover)
	if err != nil {
		return false, err
	}

	port := getRedisPort(rFailover.Spec.Redis.Port)
	return r.redisClient.SlaveIsReady(ip, port, password)
}

// IsRedisRunning returns true if all the pods are Running
func (r *RedisFailoverChecker) IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool {
	dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover))
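	// len(dp.Items) > Replicas-1 is the same as len(dp.Items) >= Replicas: require at least the requested number of pods.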
	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
}

// IsSentinelRunning returns true if all the pods are Running
func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool {
	dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover))
	return err == nil && len(dp.Items) > int(rFailover.Spec.Sentinel.Replicas-1) && AreAllRunning(dp)
}

// IsClusterRunning returns true if all the pods in the given redisfailover are Running
func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool {
	return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover)
}

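// getRedisPort converts the configured redis port into the string form expected by the redis client.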
func getRedisPort(p int32) string {
	return strconv.Itoa(int(p))
}

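// AreAllRunning returns true only when every pod in the list is in the Running phase and not marked for deletion.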
func AreAllRunning(pods *corev1.PodList) bool {
	for _, pod := range pods.Items {
		if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
			return false
		}
	}
	return true
}