k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cmd/kubeadm/app/phases/upgrade/health.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package upgrade

import (
	"context"
	"fmt"
	"os"
	"time"

	"github.com/pkg/errors"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"

	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
	"k8s.io/kubernetes/cmd/kubeadm/app/images"
	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
	"k8s.io/kubernetes/cmd/kubeadm/app/util/output"
)

// healthCheck is a helper struct for easily performing health checks against the cluster and printing the output
type healthCheck struct {
	name   string
	client clientset.Interface
	cfg    *kubeadmapi.ClusterConfiguration
	// f is invoked with a k8s client and a kubeadm ClusterConfiguration; it returns an error if the check fails
	f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
}

// Check is part of the preflight.Checker interface
func (c *healthCheck) Check() (warnings, errors []error) {
	if err := c.f(c.client, c.cfg); err != nil {
		return nil, []error{err}
	}
	return nil, nil
}

// Name is part of the preflight.Checker interface
func (c *healthCheck) Name() string {
	return c.name
}

// CheckClusterHealth makes sure:
// - the cluster can accept a workload
// - all control-plane Nodes are Ready
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
	_, _ = printer.Println("[upgrade] Running cluster health checks")

	healthChecks := []preflight.Checker{
		&healthCheck{
			name:   "CreateJob",
			client: client,
			cfg:    cfg,
			f:      createJob,
		},
		&healthCheck{
			name:   "ControlPlaneNodesReady",
			client: client,
			f:      controlPlaneNodesReady,
		},
		&healthCheck{
			name: "StaticPodManifest",
			f:    staticPodManifestHealth,
		},
	}

	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
}
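// The healthCheck wrapper above makes ad-hoc checks cheap to compose: any
// func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error can be
// wrapped and handed to preflight.RunChecks. As an illustrative sketch only
// (not part of the original file; "APIServerReachable" is an assumed name), a
// caller could add a check that probes the API server via the discovery
// client before the heavier Job-based check runs:
//
//	extra := &healthCheck{
//		name:   "APIServerReachable",
//		client: client,
//		f: func(c clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
//			// ServerVersion performs a cheap GET against /version.
//			_, err := c.Discovery().ServerVersion()
//			return err
//		},
//	}
//	_ = preflight.RunChecks([]preflight.Checker{extra}, os.Stderr, sets.New[string]())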
// createJob is a check that verifies that a Job can be created in the cluster
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
	const (
		prefix        = "upgrade-health-check"
		fieldSelector = "spec.unschedulable=false"
		ns            = metav1.NamespaceSystem
		timeout       = 15 * time.Second
	)
	var (
		err, lastError error
		ctx            = context.Background()
		nodes          *v1.NodeList
		listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
	)

	// If client.Discovery().RESTClient() is nil, the fake client is used.
	// Return early because the kubeadm dryrun dynamic client only handles the core/v1 GroupVersion.
	if client.Discovery().RESTClient() == nil {
		fmt.Printf("[upgrade/health] Would create the Job with the prefix %q in namespace %q and wait until it completes\n", prefix, ns)
		return nil
	}

	// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
		if err != nil {
			klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
			lastError = err
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
	}

	if len(nodes.Items) == 0 {
		klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
		return nil
	}

	// Prepare Job
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: prefix + "-",
			Namespace:    ns,
		},
		Spec: batchv1.JobSpec{
			BackoffLimit:            ptr.To[int32](0),
			TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
			Template: v1.PodTemplateSpec{
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyNever,
					SecurityContext: &v1.PodSecurityContext{
						RunAsUser:    ptr.To[int64](999),
						RunAsGroup:   ptr.To[int64](999),
						RunAsNonRoot: ptr.To(true),
					},
					Tolerations: []v1.Toleration{
						{
							Key:    constants.LabelNodeRoleControlPlane,
							Effect: v1.TaintEffectNoSchedule,
						},
					},
					Containers: []v1.Container{
						{
							Name:  prefix,
							Image: images.GetPauseImage(cfg),
							Args:  []string{"-v"},
						},
					},
				},
			},
		},
	}

	// Create the Job, but retry if it fails
	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
	var jobName string
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
		if err != nil {
			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
			lastError = err
			return false, nil
		}

		jobName = createdJob.Name
		return true, nil
	})
	if err != nil {
		return errors.Wrapf(lastError, "could not create a Job with the prefix %q in the namespace %q", prefix, ns)
	}

	// Wait for the Job to complete
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
		if err != nil {
			lastError = err
			klog.V(2).Infof("Could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
			return false, nil
		}
		for _, cond := range job.Status.Conditions {
			if cond.Type == batchv1.JobComplete {
				return true, nil
			}
		}
		lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
		klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
		return false, nil
	})
	if err != nil {
		return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
	}

	klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)

	return nil
}
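// A minimal sketch (not part of the original file) of the dry-run branch at
// the top of createJob: the fake clientset returns a nil
// Discovery().RESTClient(), so createJob only prints what it would do and
// returns nil without contacting a real cluster. Assumes an extra import of
// "k8s.io/client-go/kubernetes/fake":
//
//	client := fake.NewSimpleClientset()
//	if err := createJob(client, &kubeadmapi.ClusterConfiguration{}); err != nil {
//		klog.Errorf("unexpected error on the dry-run path: %v", err)
//	}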
// controlPlaneNodesReady checks whether all control-plane Nodes in the cluster are in the Ready state
func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
	selectorControlPlane := labels.SelectorFromSet(map[string]string{
		constants.LabelNodeRoleControlPlane: "",
	})
	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		LabelSelector: selectorControlPlane.String(),
	})
	if err != nil {
		return errors.Wrapf(err, "could not list nodes labeled with %q", constants.LabelNodeRoleControlPlane)
	}

	notReadyControlPlanes := getNotReadyNodes(nodes.Items)
	if len(notReadyControlPlanes) != 0 {
		return errors.Errorf("there are NotReady control-planes in the cluster: %v", notReadyControlPlanes)
	}
	return nil
}

// staticPodManifestHealth makes sure the required static pod manifests are present on disk
func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
	var nonExistentManifests []string
	for _, component := range constants.ControlPlaneComponents {
		manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())
		if _, err := os.Stat(manifestFile); os.IsNotExist(err) {
			nonExistentManifests = append(nonExistentManifests, manifestFile)
		}
	}
	if len(nonExistentManifests) == 0 {
		return nil
	}
	return errors.Errorf("The control plane seems to be Static Pod-hosted, but some of the manifests don't seem to exist on disk. This probably means you're running 'kubeadm upgrade' on a remote machine, which is not supported for a Static Pod-hosted cluster. Manifest files not found: %v", nonExistentManifests)
}

// getNotReadyNodes returns a string slice of nodes in the cluster that are NotReady
func getNotReadyNodes(nodes []v1.Node) []string {
	var notReadyNodes []string
	for _, node := range nodes {
		for _, condition := range node.Status.Conditions {
			if condition.Type == v1.NodeReady && condition.Status != v1.ConditionTrue {
				notReadyNodes = append(notReadyNodes, node.ObjectMeta.Name)
			}
		}
	}
	return notReadyNodes
}
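// An illustrative sketch (not part of the original file) of how
// getNotReadyNodes classifies Nodes: any Node whose NodeReady condition is
// not ConditionTrue is reported by name, so both False and Unknown count as
// NotReady:
//
//	nodes := []v1.Node{
//		{
//			ObjectMeta: metav1.ObjectMeta{Name: "cp-1"},
//			Status: v1.NodeStatus{Conditions: []v1.NodeCondition{
//				{Type: v1.NodeReady, Status: v1.ConditionFalse},
//			}},
//		},
//		{
//			ObjectMeta: metav1.ObjectMeta{Name: "cp-2"},
//			Status: v1.NodeStatus{Conditions: []v1.NodeCondition{
//				{Type: v1.NodeReady, Status: v1.ConditionTrue},
//			}},
//		},
//	}
//	fmt.Println(getNotReadyNodes(nodes)) // prints [cp-1]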