github.com/jlmeeker/kismatic@v1.10.1-0.20180612190640-57f9005a1f1a/pkg/install/upgrade.go

package install

import (
	"fmt"
	"strings"

	"github.com/apprenda/kismatic/pkg/data"
)

type upgradeKubeInfoClient interface {
	data.PodLister
	data.DaemonSetGetter
	data.ReplicationControllerGetter
	data.ReplicaSetGetter
	data.PersistentVolumeClaimGetter
	data.PersistentVolumeGetter
	data.StatefulSetGetter
}

type etcdNodeCountErr struct{}

func (e etcdNodeCountErr) Error() string {
	return "This node is part of an etcd cluster that has fewer than 3 members. " +
		"Upgrading it may make the cluster unavailable."
}

type masterNodeCountErr struct{}

func (e masterNodeCountErr) Error() string {
	return "This is the only master node in the cluster. " +
		"Upgrading it may make the cluster unavailable."
}

type masterNodeLoadBalancingErr struct{}

func (e masterNodeLoadBalancingErr) Error() string {
	return "This node is acting as the load-balanced endpoint for the master nodes. " +
		"Upgrading it may make the cluster unavailable."
}

type ingressNotSupportedErr struct{}

func (e ingressNotSupportedErr) Error() string {
	return "Upgrading this node may result in service unavailability if clients are accessing services directly through this ingress point."
}

type storageNotSupportedErr struct{}

func (e storageNotSupportedErr) Error() string {
	return "Upgrading this node may result in storage volumes becoming temporarily unavailable."
}

type workerNodeCountErr struct{}

func (e workerNodeCountErr) Error() string {
	return "This is the only worker node in the cluster. " +
		"Upgrading it may make cluster features unavailable."
}

type podUnsafeVolumeErr struct {
	namespace string
	name      string
	volType   string
	volName   string
}

func (e podUnsafeVolumeErr) Error() string {
	return fmt.Sprintf(`Pod "%s/%s" is using %s volume %q, which is unsafe for upgrades.`, e.namespace, e.name, e.volType, e.volName)
}

type podUnsafePersistentVolumeErr struct {
	namespace string
	name      string
	volType   string
	volName   string
}

func (e podUnsafePersistentVolumeErr) Error() string {
	return fmt.Sprintf(`Pod "%s/%s" is using volume %q, which is backed by a %s PersistentVolume. `+
		`This kind of volume is unsafe for upgrades.`, e.namespace, e.name, e.volName, e.volType)
}

type podUnsafeDaemonErr struct {
	dsNamespace string
	dsName      string
}

func (e podUnsafeDaemonErr) Error() string {
	return fmt.Sprintf(`Pod managed by DaemonSet "%s/%s" is running on this node, and no other nodes `+
		"are capable of hosting this daemon. Upgrading it may make the daemon unavailable.", e.dsNamespace, e.dsName)
}

type unmanagedPodErr struct {
	namespace string
	name      string
}

func (e unmanagedPodErr) Error() string {
	return fmt.Sprintf(`The pod "%s/%s" is not being managed by a controller. `+
		"Upgrading this node might result in data or availability loss.", e.namespace, e.name)
}
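
// Because unsafe conditions are modeled as distinct error types, callers can
// react to a specific condition with a type assertion. A minimal, illustrative
// sketch (not part of the original API; errs is assumed to come from a call to
// DetectNodeUpgradeSafety below):
//
//	for _, err := range errs {
//		if _, ok := err.(etcdNodeCountErr); ok {
//			// e.g. treat a small etcd cluster as a hard failure
//		}
//	}
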
type unsafeReplicaCountErr struct {
	kind      string
	namespace string
	name      string
}

func (e unsafeReplicaCountErr) Error() string {
	return fmt.Sprintf(`Pod managed by %s "%s/%s" is running on this node, `+
		"and the %s does not have a replica count greater than 1.", e.kind, e.namespace, e.name, e.kind)
}

type replicasOnSingleNodeErr struct {
	kind      string
	namespace string
	name      string
}

func (e replicasOnSingleNodeErr) Error() string {
	return fmt.Sprintf(`All the replicas that belong to the %s "%s/%s" are running on this node.`, e.kind, e.namespace, e.name)
}

type podRunningJobErr struct {
	namespace string
	name      string
}

func (e podRunningJobErr) Error() string {
	return fmt.Sprintf(`Pod that belongs to job "%s/%s" is running on this node.`, e.namespace, e.name)
}

// DetectNodeUpgradeSafety determines whether it's safe to upgrade a specific node
// listed in the plan file. If any condition that could result in data or availability
// loss is detected, the upgrade is deemed unsafe, and the conditions are returned as errors.
func DetectNodeUpgradeSafety(plan Plan, node Node, kubeClient upgradeKubeInfoClient) []error {
	errs := []error{}
	roles := plan.GetRolesForIP(node.IP)
	for _, role := range roles {
		switch role {
		case "etcd":
			if plan.Etcd.ExpectedCount < 3 {
				errs = append(errs, etcdNodeCountErr{})
			}
		case "master":
			if plan.Master.ExpectedCount < 2 {
				errs = append(errs, masterNodeCountErr{})
			}
			lbFQDN := plan.Master.LoadBalancedFQDN
			if lbFQDN == node.Host || lbFQDN == node.IP {
				errs = append(errs, masterNodeLoadBalancingErr{})
			}
		case "ingress":
			// We don't control load balancing of ingress nodes. Therefore,
			// upgrading an ingress node is potentially unsafe.
			errs = append(errs, ingressNotSupportedErr{})
		case "storage":
			// We could potentially detect the safety of upgrading a storage node by
			// inspecting the volumes on it. For now, we are choosing not to support
			// online upgrades of storage nodes.
			errs = append(errs, storageNotSupportedErr{})
		case "worker":
			if plan.Worker.ExpectedCount < 2 {
				errs = append(errs, workerNodeCountErr{})
			}
			if workerErrs := detectWorkerNodeUpgradeSafety(node, kubeClient); workerErrs != nil {
				errs = append(errs, workerErrs...)
			}
		}
	}
	return errs
}
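
// printNodeUpgradeSafety is a minimal usage sketch, not part of the original
// package: it assumes a Plan and Node obtained from a parsed plan file, and
// any client that satisfies upgradeKubeInfoClient. It prints each unsafe
// condition and reports whether the node can be upgraded online.
func printNodeUpgradeSafety(plan Plan, node Node, client upgradeKubeInfoClient) bool {
	errs := DetectNodeUpgradeSafety(plan, node, client)
	for _, err := range errs {
		fmt.Println(err)
	}
	return len(errs) == 0
}
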
func detectWorkerNodeUpgradeSafety(node Node, kubeClient upgradeKubeInfoClient) []error {
	errs := []error{}
	podList, err := kubeClient.ListPods()
	if err != nil || podList == nil {
		errs = append(errs, fmt.Errorf("unable to determine node upgrade safety: %v", err))
		return errs
	}
	nodePods := []data.Pod{}
	for _, p := range podList.Items {
		// Don't check pods that are running in the "kube-system" namespace
		if p.Spec.NodeName == node.Host && p.Namespace != "kube-system" {
			nodePods = append(nodePods, p)
		}
	}

	// Are there any pods using a hostPath or emptyDir volume, or a hostPath PersistentVolume?
	for _, p := range nodePods {
		for _, v := range p.Spec.Volumes {
			if v.VolumeSource.HostPath != nil {
				errs = append(errs, podUnsafeVolumeErr{namespace: p.Namespace, name: p.Name, volType: "HostPath", volName: v.Name})
			}
			if v.VolumeSource.EmptyDir != nil {
				errs = append(errs, podUnsafeVolumeErr{namespace: p.Namespace, name: p.Name, volType: "EmptyDir", volName: v.Name})
			}
			if v.VolumeSource.PersistentVolumeClaim != nil {
				claimRef := v.VolumeSource.PersistentVolumeClaim
				pvc, err := kubeClient.GetPersistentVolumeClaim(p.Namespace, claimRef.ClaimName)
				if err != nil || pvc == nil {
					errs = append(errs, fmt.Errorf(`Failed to get PersistentVolumeClaim "%s/%s".`, p.Namespace, claimRef.ClaimName))
					continue
				}
				pvName := pvc.Spec.VolumeName
				pv, err := kubeClient.GetPersistentVolume(pvName)
				if err != nil || pv == nil {
					errs = append(errs, fmt.Errorf(`Failed to get PersistentVolume %q. This PV is being used by pod "%s/%s" on this node.`, pvName, p.Namespace, p.Name))
					continue
				}
				if pv.Spec.HostPath != nil {
					errs = append(errs, podUnsafePersistentVolumeErr{namespace: p.Namespace, name: p.Name, volType: "HostPath", volName: v.Name})
				}
			}
		}
	}
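
	// Note (added commentary): the volume checks above cover hostPath and
	// emptyDir pod volumes, plus PersistentVolumes backed by hostPath. Other
	// node-local volume sources (for example, "local" PersistentVolumes) are
	// not inspected here.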
	// Keep track of how many pods managed by ReplicationControllers and ReplicaSets
	// are running on this node. If all replicas are running on the node, we need to
	// return an error, as upgrading the node would take the workload down.
	rcPods := map[string]int32{}
	rsPods := map[string]int32{}

	// 1. Are there any pods running on this node that are not managed by a controller?
	// 2. Are there any pods running on this node that are managed by a controller
	//    and have fewer than 2 replicas?
	// 3. Are there any DaemonSet-managed pods running on this node? If so,
	//    verify that each is not the only one.
	// 4. Are there any pods that belong to a job running on this node?
	for _, p := range nodePods {
		if len(p.ObjectMeta.OwnerReferences) == 0 {
			errs = append(errs, unmanagedPodErr{namespace: p.Namespace, name: p.Name})
			continue
		}
		owner := p.ObjectMeta.OwnerReferences[0]
		if owner.Kind == "" || owner.Name == "" {
			errs = append(errs, fmt.Errorf(`Unable to determine the owner of pod "%s/%s"`, p.Namespace, p.Name))
			continue
		}
		switch strings.ToLower(owner.Kind) {
		default:
			errs = append(errs, fmt.Errorf("Unable to determine upgrade safety for a pod managed by a controller of type %q", owner.Kind))
		case "daemonset":
			ds, err := kubeClient.GetDaemonSet(p.Namespace, owner.Name)
			if err != nil || ds == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about DaemonSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			// Check whether other nodes should also be running this DaemonSet
			if ds.Status.DesiredNumberScheduled < 2 {
				errs = append(errs, podUnsafeDaemonErr{dsNamespace: p.Namespace, dsName: owner.Name})
			}
		case "job":
			errs = append(errs, podRunningJobErr{namespace: p.Namespace, name: owner.Name})
		case "replicationcontroller":
			rc, err := kubeClient.GetReplicationController(p.Namespace, owner.Name)
			if err != nil || rc == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about ReplicationController "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if rc.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
			// Use a "namespace/name" key so that distinct controllers cannot collide.
			rcPods[p.Namespace+"/"+owner.Name]++
			if rcPods[p.Namespace+"/"+owner.Name] == rc.Status.Replicas {
				errs = append(errs, replicasOnSingleNodeErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		case "replicaset":
			rs, err := kubeClient.GetReplicaSet(p.Namespace, owner.Name)
			if err != nil || rs == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about ReplicaSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if rs.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
			rsPods[p.Namespace+"/"+owner.Name]++
			if rsPods[p.Namespace+"/"+owner.Name] == rs.Status.Replicas {
				errs = append(errs, replicasOnSingleNodeErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		case "statefulset":
			sts, err := kubeClient.GetStatefulSet(p.Namespace, owner.Name)
			if err != nil || sts == nil {
				errs = append(errs, fmt.Errorf(`Failed to get information about StatefulSet "%s/%s"`, p.Namespace, owner.Name))
				continue
			}
			if sts.Status.Replicas < 2 {
				errs = append(errs, unsafeReplicaCountErr{kind: owner.Kind, namespace: p.Namespace, name: owner.Name})
			}
		}
	}

	return errs
}
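
// ownerKindAndName is an illustrative sketch, not part of the original file:
// it isolates the owner-resolution step used by the controller checks in
// detectWorkerNodeUpgradeSafety. It reports the kind and name of the first
// owner reference of a pod, and whether that reference is usable.
func ownerKindAndName(p data.Pod) (kind string, name string, ok bool) {
	if len(p.ObjectMeta.OwnerReferences) == 0 {
		return "", "", false
	}
	owner := p.ObjectMeta.OwnerReferences[0]
	return owner.Kind, owner.Name, owner.Kind != "" && owner.Name != ""
}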