sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/workload_cluster_conditions.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package internal

import (
    "context"
    "fmt"
    "strings"

    "github.com/pkg/errors"
    corev1 "k8s.io/api/core/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/sets"
    ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"

    clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
    etcdutil "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd/util"
    "sigs.k8s.io/cluster-api/util/collections"
    "sigs.k8s.io/cluster-api/util/conditions"
)

// UpdateEtcdConditions is responsible for updating machine conditions reflecting the status of all the etcd members.
// This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
// the condition to Unknown state without returning any error.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
    if controlPlane.IsEtcdManaged() {
        w.updateManagedEtcdConditions(ctx, controlPlane)
        return
    }
    w.updateExternalEtcdConditions(ctx, controlPlane)
}

func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane *ControlPlane) {
    // When KCP is not responsible for external etcd, we are reporting only health at KCP level.
    conditions.MarkTrue(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition)

    // TODO: check external etcd for alarms and possibly also for member errors.
    // This requires implementing a new type of etcd client generator, given that it is not possible to use nodes
    // as a source for the etcd endpoint address; the address of the external etcd should be available in the kubeadm configuration.
}

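// Illustrative sketch (not part of the upstream source): how a reconciler is expected to
// drive the two condition-update entry points defined in this file. The function name and
// wiring below are hypothetical; the real call sites live in the KubeadmControlPlane controller.
func exampleUpdateWorkloadConditions(ctx context.Context, w *Workload, controlPlane *ControlPlane) {
    // Both methods are best effort: they record Unknown/False conditions on the machines
    // and on the KubeadmControlPlane object instead of returning errors, so the caller can
    // always proceed to patch whatever status was collected.
    w.UpdateEtcdConditions(ctx, controlPlane)
    w.UpdateStaticPodConditions(ctx, controlPlane)
}
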
func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
    // NOTE: This method uses control plane nodes only to get in contact with etcd, but then it relies on etcd
    // as the ultimate source of truth for the list of members and for their health.
    controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
    if err != nil {
        conditions.MarkUnknown(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list nodes which are hosting the etcd members")
        for _, m := range controlPlane.Machines {
            conditions.MarkUnknown(m, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the node which is hosting the etcd member")
        }
        return
    }

    // Update conditions for etcd members on the nodes.
    var (
        // kcpErrors is used to store errors that can't be reported on any machine.
        kcpErrors []string
        // clusterID is used to store and compare the etcd cluster id.
        clusterID *uint64
        // members is used to store the list of etcd members and compare with all the other nodes in the cluster.
        members []*etcd.Member
    )

    for _, node := range controlPlaneNodes.Items {
        // Search for the machine corresponding to the node.
        var machine *clusterv1.Machine
        for _, m := range controlPlane.Machines {
            if m.Status.NodeRef != nil && m.Status.NodeRef.Name == node.Name {
                machine = m
            }
        }

        if machine == nil {
            // If there are machines still provisioning there is the chance that a node might be linked to a machine soon,
            // otherwise report the error at KCP level given that there is no machine to report on.
            if hasProvisioningMachine(controlPlane.Machines) {
                continue
            }
            kcpErrors = append(kcpErrors, fmt.Sprintf("Control plane node %s does not have a corresponding machine", node.Name))
            continue
        }

        // If the machine is deleting, report all the conditions as deleting.
        if !machine.ObjectMeta.DeletionTimestamp.IsZero() {
            conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
            continue
        }

        currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name)
        if err != nil {
            continue
        }

        // Check if the list of member IDs reported is the same as for all other members.
        // NOTE: the first member reporting this information is the baseline for this information.
        if members == nil {
            members = currentMembers
        }
        if !etcdutil.MemberEqual(members, currentMembers) {
            conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member reports the cluster is composed by members %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(members))
            continue
        }

        // Retrieve the member and check for alarms.
        // NB. The member for this node always exists given forFirstAvailableNode(node) used above.
        member := etcdutil.MemberForName(currentMembers, node.Name)
        if member == nil {
            conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member reports the cluster is composed by members %s, but the member itself (%s) is not included", etcdutil.MemberNames(currentMembers), node.Name)
            continue
        }
        if len(member.Alarms) > 0 {
            alarmList := []string{}
            for _, alarm := range member.Alarms {
                switch alarm {
                case etcd.AlarmOK:
                    continue
                default:
                    alarmList = append(alarmList, etcd.AlarmTypeName[alarm])
                }
            }
            if len(alarmList) > 0 {
                conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", strings.Join(alarmList, ", "))
                continue
            }
        }

        // Check if the member belongs to the same cluster as all other members.
        // NOTE: the first member reporting this information is the baseline for this information.
        if clusterID == nil {
            clusterID = &member.ClusterID
        }
        if *clusterID != member.ClusterID {
            conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "etcd member has cluster ID %d, but all previously seen etcd members have cluster ID %d", member.ClusterID, *clusterID)
            continue
        }

        conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
    }

    // Make sure that the list of etcd members and machines is consistent.
    kcpErrors = compareMachinesAndMembers(controlPlane, members, kcpErrors)

    // Aggregate component errors from machines at KCP level.
    aggregateFromMachinesToKCP(aggregateFromMachinesToKCPInput{
        controlPlane:      controlPlane,
        machineConditions: []clusterv1.ConditionType{controlplanev1.MachineEtcdMemberHealthyCondition},
        kcpErrors:         kcpErrors,
        condition:         controlplanev1.EtcdClusterHealthyCondition,
        unhealthyReason:   controlplanev1.EtcdClusterUnhealthyReason,
        unknownReason:     controlplanev1.EtcdClusterUnknownReason,
        note:              "etcd member",
    })
}

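// Illustrative example (not part of the upstream source): the baseline check above treats
// the first member list it sees as the reference and flags any node whose view of the
// membership differs. The etcd.Member literals below only set the ID and Name fields that
// this file already relies on; the concrete values are made up.
func exampleMemberListDrift() bool {
    baseline := []*etcd.Member{{ID: 1, Name: "cp-1"}, {ID: 2, Name: "cp-2"}}
    reported := []*etcd.Member{{ID: 1, Name: "cp-1"}}

    // MemberEqual returns false here; updateManagedEtcdConditions surfaces such a divergence
    // as MachineEtcdMemberHealthyCondition=False on the machine that reported it.
    return etcdutil.MemberEqual(baseline, reported)
}
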
func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
    // Create the etcd Client for the etcd Pod scheduled on the Node.
    etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName})
    if err != nil {
        conditions.MarkUnknown(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd pod on the %s node: %s", nodeName, err)
        return nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd pod on the %s node", nodeName)
    }
    defer etcdClient.Close()

    // While creating a new client, forFirstAvailableNode retrieves the status for the endpoint; check if the endpoint has errors.
    if len(etcdClient.Errors) > 0 {
        conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
        return nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
    }

    // Get the list of etcd members known by this member.
    currentMembers, err := etcdClient.Members(ctx)
    if err != nil {
        // NB. We should never be in here, given that we just received an answer to the etcd calls included in forFirstAvailableNode;
        // however, we are considering the calls to Members a signal of etcd not being stable.
        conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd member on the %s node", nodeName)
        return nil, errors.Errorf("failed to get current etcd members: failed to get answer from the etcd member on the %s node", nodeName)
    }

    return currentMembers, nil
}

func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Member, kcpErrors []string) []string {
    // NOTE: We run this check only if we actually know the list of members, otherwise the first for loop
    // could generate a false negative when reporting missing etcd members.
    if members == nil {
        return kcpErrors
    }

    // Check Machine -> etcd member.
    for _, machine := range controlPlane.Machines {
        if machine.Status.NodeRef == nil {
            continue
        }
        found := false
        for _, member := range members {
            if machine.Status.NodeRef.Name == member.Name {
                found = true
                break
            }
        }
        if !found {
            conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Missing etcd member")
        }
    }

    // Check etcd member -> Machine.
    for _, member := range members {
        found := false
        for _, machine := range controlPlane.Machines {
            if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == member.Name {
                found = true
                break
            }
        }
        if !found {
            name := member.Name
            if name == "" {
                name = fmt.Sprintf("%d (Name not yet assigned)", member.ID)
            }
            kcpErrors = append(kcpErrors, fmt.Sprintf("etcd member %s does not have a corresponding machine", name))
        }
    }
    return kcpErrors
}

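// Illustrative example (not part of the upstream source): compareMachinesAndMembers checks
// consistency in both directions. A machine without a matching member gets a "Missing etcd
// member" condition; a member without a matching machine becomes a KCP-level error string.
// collections.FromMachines is assumed to be the existing constructor from
// sigs.k8s.io/cluster-api/util/collections; the object contents are made up.
func exampleCompareMachinesAndMembers() []string {
    machine := &clusterv1.Machine{
        ObjectMeta: metav1.ObjectMeta{Name: "cp-1"},
        Status:     clusterv1.MachineStatus{NodeRef: &corev1.ObjectReference{Name: "cp-1"}},
    }
    controlPlane := &ControlPlane{Machines: collections.FromMachines(machine)}

    members := []*etcd.Member{
        {ID: 1, Name: "cp-1"},
        {ID: 2, Name: "cp-2"}, // no machine is associated with this member
    }

    // Returns a single KCP-level error about the orphaned member "cp-2"; "cp-1" matches.
    return compareMachinesAndMembers(controlPlane, members, nil)
}
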
// UpdateStaticPodConditions is responsible for updating machine conditions reflecting the status of all the control plane
// components running in a static pod generated by kubeadm. This operation is best effort, in the sense that in case
// of problems in retrieving the pod status, it sets the condition to Unknown state without returning any error.
func (w *Workload) UpdateStaticPodConditions(ctx context.Context, controlPlane *ControlPlane) {
    allMachinePodConditions := []clusterv1.ConditionType{
        controlplanev1.MachineAPIServerPodHealthyCondition,
        controlplanev1.MachineControllerManagerPodHealthyCondition,
        controlplanev1.MachineSchedulerPodHealthyCondition,
    }
    if controlPlane.IsEtcdManaged() {
        allMachinePodConditions = append(allMachinePodConditions, controlplanev1.MachineEtcdPodHealthyCondition)
    }

    // NOTE: this func uses control plane nodes from the workload cluster as a source of truth for the current state.
    controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
    if err != nil {
        for i := range controlPlane.Machines {
            machine := controlPlane.Machines[i]
            for _, condition := range allMachinePodConditions {
                conditions.MarkUnknown(machine, condition, controlplanev1.PodInspectionFailedReason, "Failed to get the node which is hosting this component: %v", err)
            }
        }
        conditions.MarkUnknown(controlPlane.KCP, controlplanev1.ControlPlaneComponentsHealthyCondition, controlplanev1.ControlPlaneComponentsInspectionFailedReason, "Failed to list nodes which are hosting control plane components: %v", err)
        return
    }

    // Update conditions for control plane components hosted as static pods on the nodes.
    var kcpErrors []string

    for _, node := range controlPlaneNodes.Items {
        // Search for the machine corresponding to the node.
        var machine *clusterv1.Machine
        for _, m := range controlPlane.Machines {
            if m.Status.NodeRef != nil && m.Status.NodeRef.Name == node.Name {
                machine = m
                break
            }
        }

        // If there is no machine corresponding to a node, determine if this is an error or not.
        if machine == nil {
            // If there are machines still provisioning there is the chance that a node might be linked to a machine soon,
            // otherwise report the error at KCP level given that there is no machine to report on.
            if hasProvisioningMachine(controlPlane.Machines) {
                continue
            }
            kcpErrors = append(kcpErrors, fmt.Sprintf("Control plane node %s does not have a corresponding machine", node.Name))
            continue
        }

        // If the machine is deleting, report all the conditions as deleting.
        if !machine.ObjectMeta.DeletionTimestamp.IsZero() {
            for _, condition := range allMachinePodConditions {
                conditions.MarkFalse(machine, condition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
            }
            continue
        }

        // If the node is unreachable, information about static pods could be stale, so set all conditions to unknown.
        if nodeHasUnreachableTaint(node) {
            // NOTE: We are assuming unreachable is a temporary condition, leaving to MHC
            // the responsibility to determine if the node is unhealthy or not.
            for _, condition := range allMachinePodConditions {
                conditions.MarkUnknown(machine, condition, controlplanev1.PodInspectionFailedReason, "Node is unreachable")
            }
            continue
        }

        // Otherwise update static pod based conditions reflecting the status of the underlying object generated by kubeadm.
        w.updateStaticPodCondition(ctx, machine, node, "kube-apiserver", controlplanev1.MachineAPIServerPodHealthyCondition)
        w.updateStaticPodCondition(ctx, machine, node, "kube-controller-manager", controlplanev1.MachineControllerManagerPodHealthyCondition)
        w.updateStaticPodCondition(ctx, machine, node, "kube-scheduler", controlplanev1.MachineSchedulerPodHealthyCondition)
        if controlPlane.IsEtcdManaged() {
            w.updateStaticPodCondition(ctx, machine, node, "etcd", controlplanev1.MachineEtcdPodHealthyCondition)
        }
    }

    // If there are provisioned machines without corresponding nodes, report this as a failing condition with SeverityError.
    for i := range controlPlane.Machines {
        machine := controlPlane.Machines[i]
        if machine.Status.NodeRef == nil {
            continue
        }
        found := false
        for _, node := range controlPlaneNodes.Items {
            if machine.Status.NodeRef.Name == node.Name {
                found = true
                break
            }
        }
        if !found {
            for _, condition := range allMachinePodConditions {
                conditions.MarkFalse(machine, condition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "Missing node")
            }
        }
    }

    // Aggregate component errors from machines at KCP level.
    aggregateFromMachinesToKCP(aggregateFromMachinesToKCPInput{
        controlPlane:      controlPlane,
        machineConditions: allMachinePodConditions,
        kcpErrors:         kcpErrors,
        condition:         controlplanev1.ControlPlaneComponentsHealthyCondition,
        unhealthyReason:   controlplanev1.ControlPlaneComponentsUnhealthyReason,
        unknownReason:     controlplanev1.ControlPlaneComponentsUnknownReason,
        note:              "control plane",
    })
}

func hasProvisioningMachine(machines collections.Machines) bool {
    for _, machine := range machines {
        if machine.Status.NodeRef == nil {
            return true
        }
    }
    return false
}

// nodeHasUnreachableTaint returns true if the node is unreachable from the node controller.
func nodeHasUnreachableTaint(node corev1.Node) bool {
    for _, taint := range node.Spec.Taints {
        if taint.Key == corev1.TaintNodeUnreachable && taint.Effect == corev1.TaintEffectNoExecute {
            return true
        }
    }
    return false
}

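// Illustrative example (not part of the upstream source): the taint shape that
// nodeHasUnreachableTaint looks for. The node controller applies this taint when it stops
// receiving heartbeats from the node's kubelet; the node object below is fabricated.
func exampleUnreachableNode() bool {
    node := corev1.Node{
        Spec: corev1.NodeSpec{
            Taints: []corev1.Taint{
                {Key: corev1.TaintNodeUnreachable, Effect: corev1.TaintEffectNoExecute},
            },
        },
    }
    return nodeHasUnreachableTaint(node) // true
}
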
// updateStaticPodCondition is responsible for updating machine conditions reflecting the status of a component running
// in a static pod generated by kubeadm. This operation is best effort, in the sense that in case of problems
// in retrieving the pod status, it sets the condition to Unknown state without returning any error.
func (w *Workload) updateStaticPodCondition(ctx context.Context, machine *clusterv1.Machine, node corev1.Node, component string, staticPodCondition clusterv1.ConditionType) {
    // If node Ready is unknown there is a good chance that kubelet is not updating mirror pods, so we consider pod status
    // to be unknown as well without further investigation.
    if nodeReadyUnknown(node) {
        conditions.MarkUnknown(machine, staticPodCondition, controlplanev1.PodInspectionFailedReason, "Node Ready condition is unknown, pod data might be stale")
        return
    }

    podKey := ctrlclient.ObjectKey{
        Namespace: metav1.NamespaceSystem,
        Name:      staticPodName(component, node.Name),
    }

    pod := corev1.Pod{}
    if err := w.Client.Get(ctx, podKey, &pod); err != nil {
        // If there is an error getting the Pod, do not set any conditions.
        if apierrors.IsNotFound(err) {
            conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodMissingReason, clusterv1.ConditionSeverityError, "Pod %s is missing", podKey.Name)
            return
        }
        conditions.MarkUnknown(machine, staticPodCondition, controlplanev1.PodInspectionFailedReason, "Failed to get pod status")
        return
    }

    switch pod.Status.Phase {
    case corev1.PodPending:
        // PodPending means the pod has been accepted by the system, but one or more of the containers
        // has not been started. This logic is trying to surface more details about what is happening in this phase.

        // Check if the container is still to be scheduled.
        // NOTE: This should never happen for static pods, however this check is implemented for completeness.
        if podCondition(pod, corev1.PodScheduled) != corev1.ConditionTrue {
            conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, "Waiting to be scheduled")
            return
        }

        // Check if the container is still running init containers.
        // NOTE: As of today there are no init containers in static pods generated by kubeadm, however this check is implemented for completeness.
        if podCondition(pod, corev1.PodInitialized) != corev1.ConditionTrue {
            conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, "Running init containers")
            return
        }

        // If there are no errors from containers, report provisioning without further details.
        conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, "")
    case corev1.PodRunning:
        // PodRunning means the pod has been bound to a node and all of the containers have been started.
        // At least one container is still running or is in the process of being restarted.
        // This logic is trying to determine if we are actually running or if we are in an intermediate state,
        // like e.g. a container being restarted.

        // The PodReady condition means the pod is able to service requests.
        if podCondition(pod, corev1.PodReady) == corev1.ConditionTrue {
            conditions.MarkTrue(machine, staticPodCondition)
            return
        }

        // Surface wait message from containers.
        // Exception: Since default "restartPolicy" = "Always", a container that exited with error will be in waiting state (not terminated state)
        // with "CrashLoopBackOff" reason and its LastTerminationState will be non-nil.
        var containerWaitingMessages []string
        terminatedWithError := false
        for _, containerStatus := range pod.Status.ContainerStatuses {
            if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode != 0 {
                terminatedWithError = true
            }
            if containerStatus.State.Waiting != nil {
                containerWaitingMessages = append(containerWaitingMessages, containerStatus.State.Waiting.Reason)
            }
        }
        if len(containerWaitingMessages) > 0 {
            if terminatedWithError {
                conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, strings.Join(containerWaitingMessages, ", "))
                return
            }
            // Note: Some error cases cannot be caught when container state == "Waiting",
            // e.g., "waiting.reason: ErrImagePull" is an error, but since LastTerminationState does not exist, this cannot be differentiated from "PodProvisioningReason".
            conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, strings.Join(containerWaitingMessages, ", "))
            return
        }

        // Surface error messages from containers.
        var containerTerminatedMessages []string
        for _, containerStatus := range pod.Status.ContainerStatuses {
            if containerStatus.State.Terminated != nil {
                containerTerminatedMessages = append(containerTerminatedMessages, containerStatus.State.Terminated.Reason)
            }
        }
        if len(containerTerminatedMessages) > 0 {
            conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, strings.Join(containerTerminatedMessages, ", "))
            return
        }

        // If the pod is not yet ready, most probably it is waiting for startup or readiness probes.
        // Report this as part of the provisioning process because the corresponding control plane component is not ready yet.
        conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodProvisioningReason, clusterv1.ConditionSeverityInfo, "Waiting for startup or readiness probes")
    case corev1.PodSucceeded:
        // PodSucceeded means that all containers in the pod have voluntarily terminated
        // with a container exit code of 0, and the system is not going to restart any of these containers.
        // NOTE: This should never happen for the static pods running control plane components.
        conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "All the containers have been terminated")
    case corev1.PodFailed:
        // PodFailed means that all containers in the pod have terminated, and at least one container has
        // terminated in a failure (exited with a non-zero exit code or was stopped by the system).
        // NOTE: This should never happen for the static pods running control plane components.
        conditions.MarkFalse(machine, staticPodCondition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "All the containers have been terminated")
    case corev1.PodUnknown:
        // PodUnknown means that for some reason the state of the pod could not be obtained, typically due
        // to an error in communicating with the host of the pod.
        conditions.MarkUnknown(machine, staticPodCondition, controlplanev1.PodInspectionFailedReason, "Pod is reporting unknown status")
    }
}

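// Illustrative example (not part of the upstream source): the static pod lookup key built
// by updateStaticPodCondition. staticPodName is defined elsewhere in this package; it is
// assumed to follow the kubeadm mirror pod naming convention "<component>-<node name>".
func exampleStaticPodKey() ctrlclient.ObjectKey {
    return ctrlclient.ObjectKey{
        Namespace: metav1.NamespaceSystem,
        Name:      staticPodName("kube-apiserver", "cp-1"), // e.g. "kube-apiserver-cp-1"
    }
}
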
func nodeReadyUnknown(node corev1.Node) bool {
    for _, condition := range node.Status.Conditions {
        if condition.Type == corev1.NodeReady {
            return condition.Status == corev1.ConditionUnknown
        }
    }
    return false
}

func podCondition(pod corev1.Pod, condition corev1.PodConditionType) corev1.ConditionStatus {
    for _, c := range pod.Status.Conditions {
        if c.Type == condition {
            return c.Status
        }
    }
    return corev1.ConditionUnknown
}

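// Illustrative example (not part of the upstream source): podCondition returns
// ConditionUnknown when the requested condition is not present at all, so a pod without a
// Ready condition is treated the same as a pod whose readiness is unknown.
func examplePodReady(pod corev1.Pod) bool {
    return podCondition(pod, corev1.PodReady) == corev1.ConditionTrue
}
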
type aggregateFromMachinesToKCPInput struct {
    controlPlane      *ControlPlane
    machineConditions []clusterv1.ConditionType
    kcpErrors         []string
    condition         clusterv1.ConditionType
    unhealthyReason   string
    unknownReason     string
    note              string
}

// aggregateFromMachinesToKCP aggregates a group of conditions from machines to KCP.
// NOTE: this func follows the same aggregation rules used by conditions.Merge, thus giving priority to
// errors, then warnings, then info, down to unknown.
func aggregateFromMachinesToKCP(input aggregateFromMachinesToKCPInput) {
    // Aggregate machines per condition status.
    // NB. A machine could be assigned to many groups, but only the group with the highest severity will be reported.
    kcpMachinesWithErrors := sets.Set[string]{}
    kcpMachinesWithWarnings := sets.Set[string]{}
    kcpMachinesWithInfo := sets.Set[string]{}
    kcpMachinesWithTrue := sets.Set[string]{}
    kcpMachinesWithUnknown := sets.Set[string]{}

    for i := range input.controlPlane.Machines {
        machine := input.controlPlane.Machines[i]
        for _, condition := range input.machineConditions {
            if machineCondition := conditions.Get(machine, condition); machineCondition != nil {
                switch machineCondition.Status {
                case corev1.ConditionTrue:
                    kcpMachinesWithTrue.Insert(machine.Name)
                case corev1.ConditionFalse:
                    switch machineCondition.Severity {
                    case clusterv1.ConditionSeverityInfo:
                        kcpMachinesWithInfo.Insert(machine.Name)
                    case clusterv1.ConditionSeverityWarning:
                        kcpMachinesWithWarnings.Insert(machine.Name)
                    case clusterv1.ConditionSeverityError:
                        kcpMachinesWithErrors.Insert(machine.Name)
                    }
                case corev1.ConditionUnknown:
                    kcpMachinesWithUnknown.Insert(machine.Name)
                }
            }
        }
    }

    // In case of at least one machine with errors or KCP-level errors (nodes without machines), report false, error.
    if len(kcpMachinesWithErrors) > 0 {
        input.kcpErrors = append(input.kcpErrors, fmt.Sprintf("Following machines are reporting %s errors: %s", input.note, strings.Join(sets.List(kcpMachinesWithErrors), ", ")))
    }
    if len(input.kcpErrors) > 0 {
        conditions.MarkFalse(input.controlPlane.KCP, input.condition, input.unhealthyReason, clusterv1.ConditionSeverityError, strings.Join(input.kcpErrors, "; "))
        return
    }

    // In case of no errors and at least one machine with warnings, report false, warning.
    if len(kcpMachinesWithWarnings) > 0 {
        conditions.MarkFalse(input.controlPlane.KCP, input.condition, input.unhealthyReason, clusterv1.ConditionSeverityWarning, "Following machines are reporting %s warnings: %s", input.note, strings.Join(sets.List(kcpMachinesWithWarnings), ", "))
        return
    }

    // In case of no errors, no warnings, and at least one machine with info, report false, info.
    if len(kcpMachinesWithInfo) > 0 {
        conditions.MarkFalse(input.controlPlane.KCP, input.condition, input.unhealthyReason, clusterv1.ConditionSeverityInfo, "Following machines are reporting %s info: %s", input.note, strings.Join(sets.List(kcpMachinesWithInfo), ", "))
        return
    }

    // In case of no errors, no warnings, no info, and at least one machine with true conditions, report true.
    if len(kcpMachinesWithTrue) > 0 {
        conditions.MarkTrue(input.controlPlane.KCP, input.condition)
        return
    }

    // Otherwise, if there is at least one machine with unknown, report unknown.
    if len(kcpMachinesWithUnknown) > 0 {
        conditions.MarkUnknown(input.controlPlane.KCP, input.condition, input.unknownReason, "Following machines are reporting unknown %s status: %s", input.note, strings.Join(sets.List(kcpMachinesWithUnknown), ", "))
        return
    }

    // This last case should happen only if there are no provisioned machines, and thus no machine conditions.
    // In that case, no condition is reported at KCP level either.
}

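// Illustrative sketch (not part of the upstream source): the aggregation above gives
// priority to errors over warnings, warnings over info, info over true, and true over
// unknown. collections.FromMachines is assumed to be the existing constructor from
// sigs.k8s.io/cluster-api/util/collections; the function name and object contents are made up.
func exampleAggregatePrecedence() {
    machine := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "cp-1"}}
    conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition,
        controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityWarning, "example warning")

    controlPlane := &ControlPlane{
        KCP:      &controlplanev1.KubeadmControlPlane{},
        Machines: collections.FromMachines(machine),
    }

    aggregateFromMachinesToKCP(aggregateFromMachinesToKCPInput{
        controlPlane:      controlPlane,
        machineConditions: []clusterv1.ConditionType{controlplanev1.MachineEtcdMemberHealthyCondition},
        condition:         controlplanev1.EtcdClusterHealthyCondition,
        unhealthyReason:   controlplanev1.EtcdClusterUnhealthyReason,
        unknownReason:     controlplanev1.EtcdClusterUnknownReason,
        note:              "etcd member",
    })

    // EtcdClusterHealthyCondition on the KCP object is now False with SeverityWarning:
    // a warning on one machine outranks info, true and unknown, but not an error.
    _ = conditions.Get(controlPlane.KCP, controlplanev1.EtcdClusterHealthyCondition)
}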