github.com/jlmeeker/kismatic@v1.10.1-0.20180612190640-57f9005a1f1a/pkg/install/upgrade.go

package install

import (
	"fmt"
	"strings"

	"github.com/apprenda/kismatic/pkg/data"
)

type upgradeKubeInfoClient interface {
	data.PodLister
	data.DaemonSetGetter
	data.ReplicationControllerGetter
	data.ReplicaSetGetter
	data.PersistentVolumeClaimGetter
	data.PersistentVolumeGetter
	data.StatefulSetGetter
}

type etcdNodeCountErr struct{}

func (e etcdNodeCountErr) Error() string {
	return "This node is part of an etcd cluster that has fewer than 3 members. " +
		"Upgrading it may make the cluster unavailable."
}

type masterNodeCountErr struct{}

func (e masterNodeCountErr) Error() string {
	return "This is the only master node in the cluster. " +
		"Upgrading it may make the cluster unavailable."
}

type masterNodeLoadBalancingErr struct{}

func (e masterNodeLoadBalancingErr) Error() string {
	return "This node is acting as the load-balanced endpoint for the master nodes. " +
		"Upgrading it may make the cluster unavailable."
}

type ingressNotSupportedErr struct{}

func (e ingressNotSupportedErr) Error() string {
	return "Upgrading this node may result in service unavailability if clients are accessing services directly through this ingress point."
}

type storageNotSupportedErr struct{}

func (e storageNotSupportedErr) Error() string {
	return "Upgrading this node may result in storage volumes becoming temporarily unavailable."
}

type workerNodeCountErr struct{}

func (e workerNodeCountErr) Error() string {
	return "This is the only worker node in the cluster. " +
		"Upgrading it may make cluster features unavailable."
}

type podUnsafeVolumeErr struct {
	namespace string
	name      string
	volType   string
	volName   string
}

func (e podUnsafeVolumeErr) Error() string {
	return fmt.Sprintf(`Pod "%s/%s" is using %s volume %q, which is unsafe for upgrades.`, e.namespace, e.name, e.volType, e.volName)
}

type podUnsafePersistentVolumeErr struct {
	namespace string
	name      string
	volType   string
	volName   string
}

func (e podUnsafePersistentVolumeErr) Error() string {
	return fmt.Sprintf(`Pod "%s/%s" is using volume %q, which is backed by a %s PersistentVolume. `+
		`This kind of volume is unsafe for upgrades.`, e.namespace, e.name, e.volName, e.volType)
}

type podUnsafeDaemonErr struct {
	dsNamespace string
	dsName      string
}

func (e podUnsafeDaemonErr) Error() string {
	return fmt.Sprintf(`Pod managed by DaemonSet "%s/%s" is running on this node, and no other nodes `+
		"are capable of hosting this daemon. Upgrading it may make the daemon unavailable.", e.dsNamespace, e.dsName)
}

type unmanagedPodErr struct {
	namespace string
	name      string
}

func (e unmanagedPodErr) Error() string {
	return fmt.Sprintf(`The pod "%s/%s" is not being managed by a controller. `+
		"Upgrading this node might result in data or availability loss.", e.namespace, e.name)
}
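
// Because unsafe conditions are modeled as distinct error types, callers can
// react to a specific condition with a type assertion. A minimal, illustrative
// sketch (not part of the original API; errs is assumed to come from a call to
// DetectNodeUpgradeSafety below):
//
//	for _, err := range errs {
//		if _, ok := err.(etcdNodeCountErr); ok {
//			// e.g. treat a small etcd cluster as a hard failure
//		}
//	}
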
type unsafeReplicaCountErr struct {
	kind      string
	namespace string
	name      string
}

func (e unsafeReplicaCountErr) Error() string {
	return fmt.Sprintf(`Pod managed by %s "%s/%s" is running on this node, `+
		"and the %s does not have a replica count greater than 1.", e.kind, e.namespace, e.name, e.kind)
}

type replicasOnSingleNodeErr struct {
	kind      string
	namespace string
	name      string
}

func (e replicasOnSingleNodeErr) Error() string {
	return fmt.Sprintf(`All the replicas that belong to the %s "%s/%s" are running on this node.`, e.kind, e.namespace, e.name)
}

type podRunningJobErr struct {
	namespace string
	name      string
}

func (e podRunningJobErr) Error() string {
	return fmt.Sprintf(`Pod that belongs to job "%s/%s" is running on this node.`, e.namespace, e.name)
}

// DetectNodeUpgradeSafety determines whether it's safe to upgrade a specific node
// listed in the plan file. If any condition that could result in data or availability
// loss is detected, the upgrade is deemed unsafe, and the conditions are returned as errors.
func DetectNodeUpgradeSafety(plan Plan, node Node, kubeClient upgradeKubeInfoClient) []error {
	errs := []error{}
	roles := plan.GetRolesForIP(node.IP)
	for _, role := range roles {
		switch role {
		case "etcd":
			if plan.Etcd.ExpectedCount < 3 {
				errs = append(errs, etcdNodeCountErr{})
			}
		case "master":
			if plan.Master.ExpectedCount < 2 {
				errs = append(errs, masterNodeCountErr{})
			}
			lbFQDN := plan.Master.LoadBalancedFQDN
			if lbFQDN == node.Host || lbFQDN == node.IP {
				errs = append(errs, masterNodeLoadBalancingErr{})
			}
		case "ingress":
			// We don't control load balancing of ingress nodes. Therefore,
			// upgrading an ingress node is potentially unsafe.
			errs = append(errs, ingressNotSupportedErr{})
		case "storage":
			// We could potentially detect the safety of upgrading a storage node by
			// inspecting the volumes on it. For now, we are choosing not to support
			// online upgrades of storage nodes.
			errs = append(errs, storageNotSupportedErr{})
		case "worker":
			if plan.Worker.ExpectedCount < 2 {
				errs = append(errs, workerNodeCountErr{})
			}
			if workerErrs := detectWorkerNodeUpgradeSafety(node, kubeClient); workerErrs != nil {
				errs = append(errs, workerErrs...)
			}
		}
	}
	return errs
}
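
// printNodeUpgradeSafety is a minimal usage sketch, not part of the original
// package: it assumes a Plan and Node obtained from a parsed plan file, and
// any client that satisfies upgradeKubeInfoClient. It prints each unsafe
// condition and reports whether the node can be upgraded online.
func printNodeUpgradeSafety(plan Plan, node Node, client upgradeKubeInfoClient) bool {
	errs := DetectNodeUpgradeSafety(plan, node, client)
	for _, err := range errs {
		fmt.Println(err)
	}
	return len(errs) == 0
}
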
func detectWorkerNodeUpgradeSafety(node Node, kubeClient upgradeKubeInfoClient) []error {
	errs := []error{}
	podList, err := kubeClient.ListPods()
	if err != nil || podList == nil {
		errs = append(errs, fmt.Errorf("unable to determine node upgrade safety: %v", err))
		return errs
	}
	nodePods := []data.Pod{}
	for _, p := range podList.Items {
		// Don't check pods that are running in the "kube-system" namespace
		if p.Spec.NodeName == node.Host && p.Namespace != "kube-system" {
			nodePods = append(nodePods, p)
		}
	}

	// Are there any pods using a hostPath or emptyDir volume, or a hostPath PersistentVolume?
	for _, p := range nodePods {
		for _, v := range p.Spec.Volumes {
			if v.VolumeSource.HostPath != nil {
				errs = append(errs, podUnsafeVolumeErr{namespace: p.Namespace, name: p.Name, volType: "HostPath", volName: v.Name})
			}
			if v.VolumeSource.EmptyDir != nil {
				errs = append(errs, podUnsafeVolumeErr{namespace: p.Namespace, name: p.Name, volType: "EmptyDir", volName: v.Name})
			}
			if v.VolumeSource.PersistentVolumeClaim != nil {
				claimRef := v.VolumeSource.PersistentVolumeClaim
				pvc, err := kubeClient.GetPersistentVolumeClaim(p.Namespace, claimRef.ClaimName)
				if err != nil || pvc == nil {
					errs = append(errs, fmt.Errorf(`Failed to get PersistentVolumeClaim "%s/%s".`, p.Namespace, claimRef.ClaimName))
					continue
				}
				pvName := pvc.Spec.VolumeName
				pv, err := kubeClient.GetPersistentVolume(pvName)
				if err != nil || pv == nil {
					errs = append(errs, fmt.Errorf(`Failed to get PersistentVolume %q. This PV is being used by pod "%s/%s" on this node.`, pvName, p.Namespace, p.Name))
					continue
				}
				if pv.Spec.HostPath != nil {
					errs = append(errs, podUnsafePersistentVolumeErr{namespace: p.Namespace, name: p.Name, volType: "HostPath", volName: v.Name})
				}
			}
		}
	}
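
	// Note (added commentary): the volume checks above cover hostPath and
	// emptyDir pod volumes, plus PersistentVolumes backed by hostPath. Other
	// node-local volume sources (for example, "local" PersistentVolumes) are
	// not inspected here.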
	// Keep track of how many pods managed by ReplicationControllers and ReplicaSets
	// are running on this node. If all replicas are running on the node, we need to
	// return an error, as upgrading the node would take the workload down.
	rcPods := map[string]int32{}
	rsPods := map[string]int32{}

	// 1. Are there any pods running on this node that are not managed by a controller?
	// 2. Are there any pods running on this node that are managed by a controller
	//    and have fewer than 2 replicas?
	// 3. Are there any DaemonSet-managed pods running on this node? If so,
	//    verify that each is not the only one.
	// 4. Are there any pods that belong to a job running on this node?
	for _, p := range nodePods {
		if len(p.ObjectMeta.OwnerReferences) == 0 {
			errs = append(errs, unmanagedPodErr{namespace: p.Namespace, name: p.Name})
			continue
		}
		owner := p.ObjectMeta.OwnerReferences[0]
		if owner.Kind == "" || owner.Name == "" {
			errs = append(errs, fmt.Errorf(`Unable to determine the owner of pod "%s/%s"`, p.Namespace, p.Name))
			continue
		}
		switch strings.ToLower(owner.Kind) {
		default:
			errs = append(errs, fmt.Errorf("Unable to determine upgrade safety for a pod managed by a controller of type %q", owner.Kind))
		case "daemonset":
			ds, err := kubeClient.GetDaemonSet(p.Namespace, owner.Name)
			if err != nil || ds == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about DaemonSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			// Check whether other nodes should also be running this DaemonSet
			if ds.Status.DesiredNumberScheduled < 2 {
				errs = append(errs, podUnsafeDaemonErr{dsNamespace: p.Namespace, dsName: owner.Name})
			}
		case "job":
			errs = append(errs, podRunningJobErr{namespace: p.Namespace, name: owner.Name})
		case "replicationcontroller":
			rc, err := kubeClient.GetReplicationController(p.Namespace, owner.Name)
			if err != nil || rc == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about ReplicationController "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if rc.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
			// Use a "namespace/name" key so that distinct controllers cannot collide.
			rcPods[p.Namespace+"/"+owner.Name]++
			if rcPods[p.Namespace+"/"+owner.Name] == rc.Status.Replicas {
				errs = append(errs, replicasOnSingleNodeErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		case "replicaset":
			rs, err := kubeClient.GetReplicaSet(p.Namespace, owner.Name)
			if err != nil || rs == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about ReplicaSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if rs.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
			rsPods[p.Namespace+"/"+owner.Name]++
			if rsPods[p.Namespace+"/"+owner.Name] == rs.Status.Replicas {
				errs = append(errs, replicasOnSingleNodeErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		case "statefulset":
			sts, err := kubeClient.GetStatefulSet(p.Namespace, owner.Name)
			if err != nil || sts == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about StatefulSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if sts.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		}
	}

	return errs
}
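
// ownerKindAndName is an illustrative sketch, not part of the original file:
// it isolates the owner-resolution step used by the controller checks in
// detectWorkerNodeUpgradeSafety. It reports the kind and name of the first
// owner reference of a pod, and whether that reference is usable.
func ownerKindAndName(p data.Pod) (kind string, name string, ok bool) {
	if len(p.ObjectMeta.OwnerReferences) == 0 {
		return "", "", false
	}
	owner := p.ObjectMeta.OwnerReferences[0]
	return owner.Kind, owner.Name, owner.Kind != "" && owner.Name != ""
}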