sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/controllers/external"
	"sigs.k8s.io/cluster-api/controllers/remote"
	"sigs.k8s.io/cluster-api/internal/controllers/machine"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

const (
	// Event types.

	// EventRemediationRestricted is emitted when machine remediation
	// is restricted by the remediation circuit-shorting logic.
	EventRemediationRestricted string = "RemediationRestricted"

	maxUnhealthyKeyLog     = "max unhealthy"
	unhealthyTargetsKeyLog = "unhealthy targets"
	unhealthyRangeKeyLog   = "unhealthy range"
	totalTargetKeyLog      = "total target"
)

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinehealthchecks;machinehealthchecks/status;machinehealthchecks/finalizers,verbs=get;list;watch;update;patch

// Reconciler reconciles a MachineHealthCheck object.
type Reconciler struct {
	Client  client.Client
	Tracker *remote.ClusterCacheTracker

	// WatchFilterValue is the label value used to filter events prior to reconciliation.
	WatchFilterValue string

	controller controller.Controller
	recorder   record.EventRecorder
}

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	c, err := ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.MachineHealthCheck{}).
		Watches(
			&clusterv1.Machine{},
			handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck),
		).
		WithOptions(options).
		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
		Watches(
			&clusterv1.Cluster{},
			handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck),
			builder.WithPredicates(
				// TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
				predicates.All(ctrl.LoggerFrom(ctx),
					predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
					predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
				),
			),
		).Build(r)
	if err != nil {
		return errors.Wrap(err, "failed setting up with a controller manager")
	}

	r.controller = c
	r.recorder = mgr.GetEventRecorderFor("machinehealthcheck-controller")
	return nil
}
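// For illustration, a minimal wiring sketch for this reconciler. The names
// mgr, tracker, setupLog, and the concurrency value are assumptions for the
// example, not part of this file:
//
//	r := &machinehealthcheck.Reconciler{
//		Client:  mgr.GetClient(),
//		Tracker: tracker, // *remote.ClusterCacheTracker
//	}
//	if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}); err != nil {
//		setupLog.Error(err, "unable to create controller", "controller", "MachineHealthCheck")
//		os.Exit(1)
//	}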
func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	log := ctrl.LoggerFrom(ctx)

	// Fetch the MachineHealthCheck instance.
	m := &clusterv1.MachineHealthCheck{}
	if err := r.Client.Get(ctx, req.NamespacedName, m); err != nil {
		if apierrors.IsNotFound(err) {
			// Object not found, return. Created objects are automatically garbage collected.
			// For additional cleanup logic use finalizers.
			return ctrl.Result{}, nil
		}

		// Error reading the object - requeue the request.
		log.Error(err, "Failed to fetch MachineHealthCheck")
		return ctrl.Result{}, err
	}

	log = log.WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
	ctx = ctrl.LoggerInto(ctx, log)

	cluster, err := util.GetClusterByName(ctx, r.Client, m.Namespace, m.Spec.ClusterName)
	if err != nil {
		log.Error(err, "Failed to fetch Cluster for MachineHealthCheck")
		return ctrl.Result{}, err
	}

	// Return early if the object or Cluster is paused.
	if annotations.IsPaused(cluster, m) {
		log.Info("Reconciliation is paused for this object")
		return ctrl.Result{}, nil
	}

	// Initialize the patch helper.
	patchHelper, err := patch.NewHelper(m, r.Client)
	if err != nil {
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the object and status after each reconciliation.
		// Patch ObservedGeneration only if the reconciliation completed successfully.
		patchOpts := []patch.Option{}
		if reterr == nil {
			patchOpts = append(patchOpts, patch.WithStatusObservedGeneration{})
		}
		if err := patchHelper.Patch(ctx, m, patchOpts...); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, err})
		}
	}()

	// Reconcile labels.
	if m.Labels == nil {
		m.Labels = make(map[string]string)
	}
	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

	result, err := r.reconcile(ctx, log, cluster, m)
	if err != nil {
		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
		// the current cluster because of concurrent access.
		if errors.Is(err, remote.ErrClusterLocked) {
			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
			return ctrl.Result{RequeueAfter: time.Minute}, nil
		}
		log.Error(err, "Failed to reconcile MachineHealthCheck")
		r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err)

		// Requeue immediately if any errors occurred.
		return ctrl.Result{}, err
	}

	return result, nil
}
func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
	// Ensure the MachineHealthCheck is owned by the Cluster it belongs to.
	m.SetOwnerReferences(util.EnsureOwnerRef(m.GetOwnerReferences(), metav1.OwnerReference{
		APIVersion: clusterv1.GroupVersion.String(),
		Kind:       "Cluster",
		Name:       cluster.Name,
		UID:        cluster.UID,
	}))

	// If the cluster is already initialized, get the remote cluster cache to use as a client.Reader.
	var remoteClient client.Client
	if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
		var err error
		remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
		if err != nil {
			logger.Error(err, "error creating remote cluster cache")
			return ctrl.Result{}, err
		}

		if err := r.watchClusterNodes(ctx, cluster); err != nil {
			return ctrl.Result{}, err
		}
	}

	// Fetch all targets.
	logger.V(3).Info("Finding targets")
	targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, cluster, m)
	if err != nil {
		logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
		return ctrl.Result{}, err
	}
	totalTargets := len(targets)
	m.Status.ExpectedMachines = int32(totalTargets)
	m.Status.Targets = make([]string, totalTargets)
	for i, t := range targets {
		m.Status.Targets[i] = t.Machine.Name
	}
	// Sort the target names to avoid continually changing m.Status, as the
	// returned machines are not in a deterministic order.
	sort.Strings(m.Status.Targets)

	nodeStartupTimeout := m.Spec.NodeStartupTimeout
	if nodeStartupTimeout == nil {
		nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
	}

	// Health check all targets and reconcile MHC status.
	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
	m.Status.CurrentHealthy = int32(len(healthy))

	// Check MHC current health against MaxUnhealthy.
	remediationAllowed, remediationCount, err := isAllowedRemediation(m)
	if err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed")
	}

	if !remediationAllowed {
		var message string

		if m.Spec.UnhealthyRange == nil {
			logger.V(3).Info(
				"Short-circuiting remediation",
				totalTargetKeyLog, totalTargets,
				maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
				unhealthyTargetsKeyLog, len(unhealthy),
			)
			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
				totalTargets,
				len(unhealthy),
				m.Spec.MaxUnhealthy)
		} else {
			logger.V(3).Info(
				"Short-circuiting remediation",
				totalTargetKeyLog, totalTargets,
				unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
				unhealthyTargetsKeyLog, len(unhealthy),
			)
			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
				totalTargets,
				len(unhealthy),
				*m.Spec.UnhealthyRange)
		}

		// Remediation is not allowed: the number of not started or unhealthy machines
		// either exceeds maxUnhealthy or is not within unhealthyRange.
		m.Status.RemediationsAllowed = 0
		conditions.Set(m, &clusterv1.Condition{
			Type:     clusterv1.RemediationAllowedCondition,
			Status:   corev1.ConditionFalse,
			Severity: clusterv1.ConditionSeverityWarning,
			Reason:   clusterv1.TooManyUnhealthyReason,
			Message:  message,
		})

		r.recorder.Event(
			m,
			corev1.EventTypeWarning,
			EventRemediationRestricted,
			message,
		)
		errList := []error{}
		for _, t := range append(healthy, unhealthy...) {
			if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
				errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
				continue
			}
		}
		if len(errList) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errList)
		}
		return reconcile.Result{Requeue: true}, nil
	}

	if m.Spec.UnhealthyRange == nil {
		logger.V(3).Info(
			"Remediations are allowed",
			totalTargetKeyLog, totalTargets,
			maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
			unhealthyTargetsKeyLog, len(unhealthy),
		)
	} else {
		logger.V(3).Info(
			"Remediations are allowed",
			totalTargetKeyLog, totalTargets,
			unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
			unhealthyTargetsKeyLog, len(unhealthy),
		)
	}

	// Remediation is allowed, so unhealthyMachineCount is within unhealthyRange,
	// or maxUnhealthy - unhealthyMachineCount >= 0.
	m.Status.RemediationsAllowed = remediationCount
	conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

	errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
	errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)

	// Handle update errors.
	if len(errList) > 0 {
		logger.V(3).Info("Error(s) marking machine, requeuing")
		return reconcile.Result{}, kerrors.NewAggregate(errList)
	}

	if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
		logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
		return ctrl.Result{RequeueAfter: minNextCheck}, nil
	}

	logger.V(3).Info("No more targets meet unhealthy criteria")

	return ctrl.Result{}, nil
}
// patchHealthyTargets patches healthy machines with MachineHealthCheckSucceededCondition.
func (r *Reconciler) patchHealthyTargets(ctx context.Context, logger logr.Logger, healthy []healthCheckTarget, m *clusterv1.MachineHealthCheck) []error {
	errList := []error{}
	for _, t := range healthy {
		if m.Spec.RemediationTemplate != nil {
			// Get the remediation request object.
			obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
			if err != nil {
				if !apierrors.IsNotFound(errors.Cause(err)) {
					wrappedErr := errors.Wrapf(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName)
					errList = append(errList, wrappedErr)
				}
				continue
			}
			// Check that obj has no DeletionTimestamp to avoid a hot loop.
			if obj.GetDeletionTimestamp() == nil {
				// Issue a delete for the remediation request.
				if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
					errList = append(errList, errors.Wrapf(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name))
					continue
				}
			}
		}

		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
			logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
			errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
		}
	}
	return errList
}
// patchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation.
func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logger, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
	// Mark for remediation.
	errList := []error{}
	for _, t := range unhealthy {
		condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)

		if annotations.IsPaused(cluster, t.Machine) {
			logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
		} else {
			if m.Spec.RemediationTemplate != nil {
				// If an external remediation request already exists,
				// return early.
				if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) {
					return errList
				}

				cloneOwnerRef := &metav1.OwnerReference{
					APIVersion: clusterv1.GroupVersion.String(),
					Kind:       "Machine",
					Name:       t.Machine.Name,
					UID:        t.Machine.UID,
				}

				from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
				if err != nil {
					conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())
					errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
					return errList
				}

				generateTemplateInput := &external.GenerateTemplateInput{
					Template:    from,
					TemplateRef: m.Spec.RemediationTemplate,
					Namespace:   t.Machine.Namespace,
					ClusterName: t.Machine.Spec.ClusterName,
					OwnerRef:    cloneOwnerRef,
				}
				to, err := external.GenerateTemplate(generateTemplateInput)
				if err != nil {
					errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
					return errList
				}

				// Set the Remediation Request name to match the Machine name; the name is used to
				// guarantee uniqueness between runs. A Machine should only ever have a single
				// remediation object of a specific GVK created.
				//
				// NOTE: This doesn't guarantee uniqueness across different MHC objects watching
				// the same Machine; users are in charge of setting health checks and remediation properly.
				to.SetName(t.Machine.Name)

				logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
				// Create the external clone.
				if err := r.Client.Create(ctx, to); err != nil {
					conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())
					errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
					return errList
				}
			} else {
				logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
				// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or for triggering another remediation if the previous one is completed;
				// if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
				if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
					conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
				}
			}
		}

		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
			errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
			continue
		}
		r.recorder.Eventf(
			t.Machine,
			corev1.EventTypeNormal,
			EventMachineMarkedUnhealthy,
			"Machine %v has been marked as unhealthy",
			t.string(),
		)
	}
	return errList
}
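// For illustration, a hypothetical spec.remediationTemplate reference and the
// object the cloning above would produce (the CRD kinds here are made up;
// providers define their own):
//
//	remediationTemplate:
//	  apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
//	  kind: MyProviderRemediationTemplate
//	  name: my-remediation-template
//
// For an unhealthy Machine named "worker-abc", external.GenerateTemplate
// clones the template into a MyProviderRemediation object (the "Template"
// suffix is stripped) named "worker-abc", matching the to.SetName call above
// and owned by that Machine via cloneOwnerRef.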
// clusterToMachineHealthCheck maps events from Cluster objects to
// MachineHealthCheck objects that belong to the Cluster.
func (r *Reconciler) clusterToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	c, ok := o.(*clusterv1.Cluster)
	if !ok {
		panic(fmt.Sprintf("Expected a Cluster, got %T", o))
	}

	mhcList := &clusterv1.MachineHealthCheckList{}
	if err := r.Client.List(
		ctx,
		mhcList,
		client.InNamespace(c.Namespace),
		client.MatchingLabels{clusterv1.ClusterNameLabel: c.Name},
	); err != nil {
		return nil
	}

	// This list should only contain MachineHealthChecks which belong to the given Cluster.
	requests := []reconcile.Request{}
	for _, mhc := range mhcList.Items {
		key := types.NamespacedName{Namespace: mhc.Namespace, Name: mhc.Name}
		requests = append(requests, reconcile.Request{NamespacedName: key})
	}
	return requests
}

// machineToMachineHealthCheck maps events from Machine objects to
// MachineHealthCheck objects that monitor the given machine.
func (r *Reconciler) machineToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	m, ok := o.(*clusterv1.Machine)
	if !ok {
		panic(fmt.Sprintf("Expected a Machine, got %T", o))
	}

	mhcList := &clusterv1.MachineHealthCheckList{}
	if err := r.Client.List(
		ctx,
		mhcList,
		client.InNamespace(m.Namespace),
		client.MatchingLabels{clusterv1.ClusterNameLabel: m.Spec.ClusterName},
	); err != nil {
		return nil
	}

	var requests []reconcile.Request
	for k := range mhcList.Items {
		mhc := &mhcList.Items[k]
		if machine.HasMatchingLabels(mhc.Spec.Selector, m.Labels) {
			key := util.ObjectKey(mhc)
			requests = append(requests, reconcile.Request{NamespacedName: key})
		}
	}
	return requests
}

func (r *Reconciler) nodeToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	node, ok := o.(*corev1.Node)
	if !ok {
		panic(fmt.Sprintf("Expected a corev1.Node, got %T", o))
	}

	machine, err := getMachineFromNode(ctx, r.Client, node.Name)
	if machine == nil || err != nil {
		return nil
	}

	return r.machineToMachineHealthCheck(ctx, machine)
}

func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error {
	// If there is no tracker, don't watch remote nodes.
	if r.Tracker == nil {
		return nil
	}

	return r.Tracker.Watch(ctx, remote.WatchInput{
		Name:         "machinehealthcheck-watchClusterNodes",
		Cluster:      util.ObjectKey(cluster),
		Watcher:      r.controller,
		Kind:         &corev1.Node{},
		EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachineHealthCheck),
	})
}
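// Remote Node events fan in through the chain above: nodeToMachineHealthCheck
// maps a Node event to the Machine whose status.nodeRef points at that Node
// (via the MachineNodeNameField index in getMachineFromNode below), and
// machineToMachineHealthCheck then enqueues every MachineHealthCheck in the
// Machine's namespace whose cluster label and spec.selector match it. For
// example (hypothetical names): a NotReady Node "worker-1" resolves to the
// Machine with status.nodeRef.name == "worker-1", which in turn requeues the
// matching MachineHealthChecks.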
// getMachineFromNode retrieves the Machine with a nodeRef to nodeName.
// There should be at most one Machine with a given nodeRef; an error is returned otherwise.
func getMachineFromNode(ctx context.Context, c client.Client, nodeName string) (*clusterv1.Machine, error) {
	machineList := &clusterv1.MachineList{}
	if err := c.List(
		ctx,
		machineList,
		client.MatchingFields{index.MachineNodeNameField: nodeName},
	); err != nil {
		return nil, errors.Wrap(err, "failed getting machine list")
	}
	// TODO(vincepri): Remove this loop once controller runtime fake client supports
	// adding indexes on objects.
	items := []*clusterv1.Machine{}
	for i := range machineList.Items {
		machine := &machineList.Items[i]
		if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == nodeName {
			items = append(items, machine)
		}
	}
	if len(items) != 1 {
		return nil, errors.Errorf("expecting one machine for node %v, got %v", nodeName, machineNames(items))
	}
	return items[0], nil
}

func machineNames(machines []*clusterv1.Machine) []string {
	result := make([]string, 0, len(machines))
	for _, m := range machines {
		result = append(result, m.Name)
	}
	return result
}

// isAllowedRemediation checks the UnhealthyRange or MaxUnhealthy fields and
// returns whether remediation should be allowed, the remediation count, and an error, if any.
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
	var remediationAllowed bool
	var remediationCount int32
	if mhc.Spec.UnhealthyRange != nil {
		min, max, err := getUnhealthyRange(mhc)
		if err != nil {
			return false, 0, err
		}
		unhealthyMachineCount := unhealthyMachineCount(mhc)
		remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
		remediationCount = int32(max - unhealthyMachineCount)
		return remediationAllowed, remediationCount, nil
	}

	maxUnhealthy, err := getMaxUnhealthy(mhc)
	if err != nil {
		return false, 0, err
	}

	// Remediation is not allowed if unhealthy is above maxUnhealthy.
	unhealthyMachineCount := unhealthyMachineCount(mhc)
	remediationAllowed = unhealthyMachineCount <= maxUnhealthy
	remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
	return remediationAllowed, remediationCount, nil
}
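// A worked example with illustrative values: given status.expectedMachines=5,
// status.currentHealthy=4, and spec.maxUnhealthy="40%", unhealthyMachineCount
// is 1 and maxUnhealthy scales to 2 (40% of 5), so remediation is allowed
// with a remediation count of 2-1=1. With spec.unhealthyRange="[2-5]"
// instead, the same single unhealthy machine would short-circuit remediation,
// because 1 is below the range minimum of 2.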
// getUnhealthyRange parses an integer range and returns the min and max values.
// E.g. "[2-5]" will return (2, 5, nil).
func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
	// Remove '[' and ']'.
	unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]

	parts := strings.Split(unhealthyRange, "-")

	min, err := strconv.ParseUint(parts[0], 10, 32)
	if err != nil {
		return 0, 0, err
	}

	max, err := strconv.ParseUint(parts[1], 10, 32)
	if err != nil {
		return 0, 0, err
	}

	if max < min {
		return 0, 0, errors.Errorf("max value %d cannot be less than min value %d for unhealthyRange", max, min)
	}

	return int(min), int(max), nil
}

func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
	if mhc.Spec.MaxUnhealthy == nil {
		return 0, errors.New("spec.maxUnhealthy must be set")
	}
	maxUnhealthy, err := intstr.GetScaledValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, int(mhc.Status.ExpectedMachines), false)
	if err != nil {
		return 0, err
	}
	return maxUnhealthy, nil
}
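// For illustration: with spec.maxUnhealthy="40%" and 3 expected machines,
// intstr.GetScaledValueFromIntOrPercent rounds down (roundUp=false), so
// getMaxUnhealthy returns 1 (40% of 3 is 1.2). An integer value such as
// maxUnhealthy: 2 is returned unchanged, regardless of the expected count.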
// unhealthyMachineCount calculates the number of presently unhealthy or missing machines,
// i.e. the delta between the expected number of machines and the current number deemed healthy.
func unhealthyMachineCount(mhc *clusterv1.MachineHealthCheck) int {
	return int(mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy)
}

// getExternalRemediationRequest gets a reference to the external remediation request
// as an unstructured object.
func (r *Reconciler) getExternalRemediationRequest(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) (*unstructured.Unstructured, error) {
	remediationRef := &corev1.ObjectReference{
		APIVersion: m.Spec.RemediationTemplate.APIVersion,
		Kind:       strings.TrimSuffix(m.Spec.RemediationTemplate.Kind, clusterv1.TemplateSuffix),
		Name:       machineName,
	}
	remediationReq, err := external.Get(ctx, r.Client, remediationRef, m.Namespace)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to retrieve external remediation request object")
	}
	return remediationReq, nil
}

// externalRemediationRequestExists checks whether an external remediation request
// has been created for the machine.
func (r *Reconciler) externalRemediationRequestExists(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) bool {
	remediationReq, err := r.getExternalRemediationRequest(ctx, m, machineName)
	if err != nil {
		return false
	}
	return remediationReq != nil
}