sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
    "context"
    "fmt"
    "sort"
    "strconv"
    "strings"
    "time"

    "github.com/go-logr/logr"
    "github.com/pkg/errors"
    corev1 "k8s.io/api/core/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    "k8s.io/apimachinery/pkg/types"
    kerrors "k8s.io/apimachinery/pkg/util/errors"
    "k8s.io/apimachinery/pkg/util/intstr"
    "k8s.io/client-go/tools/record"
    "k8s.io/klog/v2"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/builder"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/controller"
    "sigs.k8s.io/controller-runtime/pkg/handler"
    "sigs.k8s.io/controller-runtime/pkg/reconcile"

    clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    "sigs.k8s.io/cluster-api/api/v1beta1/index"
    "sigs.k8s.io/cluster-api/controllers/external"
    "sigs.k8s.io/cluster-api/controllers/remote"
    "sigs.k8s.io/cluster-api/internal/controllers/machine"
    "sigs.k8s.io/cluster-api/util"
    "sigs.k8s.io/cluster-api/util/annotations"
    "sigs.k8s.io/cluster-api/util/conditions"
    "sigs.k8s.io/cluster-api/util/patch"
    "sigs.k8s.io/cluster-api/util/predicates"
)

const (
    // Event types.

    // EventRemediationRestricted is emitted when machine remediation
    // is restricted by the remediation circuit-shorting logic.
    EventRemediationRestricted string = "RemediationRestricted"

    maxUnhealthyKeyLog     = "max unhealthy"
    unhealthyTargetsKeyLog = "unhealthy targets"
    unhealthyRangeKeyLog   = "unhealthy range"
    totalTargetKeyLog      = "total target"
)

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinehealthchecks;machinehealthchecks/status;machinehealthchecks/finalizers,verbs=get;list;watch;update;patch

// Reconciler reconciles a MachineHealthCheck object.
type Reconciler struct {
    Client  client.Client
    Tracker *remote.ClusterCacheTracker

    // WatchFilterValue is the label value used to filter events prior to reconciliation.
    WatchFilterValue string

    controller controller.Controller
    recorder   record.EventRecorder
}

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
    c, err := ctrl.NewControllerManagedBy(mgr).
        For(&clusterv1.MachineHealthCheck{}).
        Watches(
            &clusterv1.Machine{},
            handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck),
        ).
        WithOptions(options).
        WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
        Watches(
            &clusterv1.Cluster{},
            handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck),
            builder.WithPredicates(
                // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
                predicates.All(ctrl.LoggerFrom(ctx),
                    predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
                    predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
                ),
            ),
        ).Build(r)
    if err != nil {
        return errors.Wrap(err, "failed setting up with a controller manager")
    }

    r.controller = c
    r.recorder = mgr.GetEventRecorderFor("machinehealthcheck-controller")
    return nil
}

func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
    log := ctrl.LoggerFrom(ctx)

    // Fetch the MachineHealthCheck instance
    m := &clusterv1.MachineHealthCheck{}
    if err := r.Client.Get(ctx, req.NamespacedName, m); err != nil {
        if apierrors.IsNotFound(err) {
            // Object not found, return. Created objects are automatically garbage collected.
            // For additional cleanup logic use finalizers.
            return ctrl.Result{}, nil
        }

        // Error reading the object - requeue the request.
        log.Error(err, "Failed to fetch MachineHealthCheck")
        return ctrl.Result{}, err
    }

    log = log.WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
    ctx = ctrl.LoggerInto(ctx, log)

    cluster, err := util.GetClusterByName(ctx, r.Client, m.Namespace, m.Spec.ClusterName)
    if err != nil {
        log.Error(err, "Failed to fetch Cluster for MachineHealthCheck")
        return ctrl.Result{}, err
    }

    // Return early if the object or Cluster is paused.
    if annotations.IsPaused(cluster, m) {
        log.Info("Reconciliation is paused for this object")
        return ctrl.Result{}, nil
    }

    // Initialize the patch helper
    patchHelper, err := patch.NewHelper(m, r.Client)
    if err != nil {
        log.Error(err, "Failed to build patch helper")
        return ctrl.Result{}, err
    }

    defer func() {
        // Always attempt to patch the object and status after each reconciliation.
        // Patch ObservedGeneration only if the reconciliation completed successfully.
        patchOpts := []patch.Option{}
        if reterr == nil {
            patchOpts = append(patchOpts, patch.WithStatusObservedGeneration{})
        }
        if err := patchHelper.Patch(ctx, m, patchOpts...); err != nil {
            reterr = kerrors.NewAggregate([]error{reterr, err})
        }
    }()

    // Reconcile labels.
    if m.Labels == nil {
        m.Labels = make(map[string]string)
    }
    m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

    result, err := r.reconcile(ctx, log, cluster, m)
    if err != nil {
        // Requeue if the reconcile failed because the ClusterCacheTracker was locked for
        // the current cluster due to concurrent access.
        if errors.Is(err, remote.ErrClusterLocked) {
            log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
            return ctrl.Result{Requeue: true}, nil
        }
        log.Error(err, "Failed to reconcile MachineHealthCheck")
        r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err)

        // Requeue immediately if any errors occurred
        return ctrl.Result{}, err
    }

    return result, nil
}

func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
    // Ensure the MachineHealthCheck is owned by the Cluster it belongs to
    m.SetOwnerReferences(util.EnsureOwnerRef(m.GetOwnerReferences(), metav1.OwnerReference{
        APIVersion: clusterv1.GroupVersion.String(),
        Kind:       "Cluster",
        Name:       cluster.Name,
        UID:        cluster.UID,
    }))

    // If the cluster is already initialized, get the remote cluster cache to use as a client.Reader.
    var remoteClient client.Client
    if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
        var err error
        remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
        if err != nil {
            logger.Error(err, "error creating remote cluster cache")
            return ctrl.Result{}, err
        }

        if err := r.watchClusterNodes(ctx, cluster); err != nil {
            return ctrl.Result{}, err
        }
    }

    // fetch all targets
    logger.V(3).Info("Finding targets")
    targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, cluster, m)
    if err != nil {
        logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
        return ctrl.Result{}, err
    }
    totalTargets := len(targets)
    m.Status.ExpectedMachines = int32(totalTargets)
    m.Status.Targets = make([]string, totalTargets)
    for i, t := range targets {
        m.Status.Targets[i] = t.Machine.Name
    }
    // Sort the targets to avoid continually changing m.Status, as the returned machines are not in a stable order.
    sort.Strings(m.Status.Targets)

    nodeStartupTimeout := m.Spec.NodeStartupTimeout
    if nodeStartupTimeout == nil {
        nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
    }

    // health check all targets and reconcile mhc status
    healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
    m.Status.CurrentHealthy = int32(len(healthy))

    // check MHC current health against MaxUnhealthy
    remediationAllowed, remediationCount, err := isAllowedRemediation(m)
    if err != nil {
        return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed")
    }

    if !remediationAllowed {
        var message string

        if m.Spec.UnhealthyRange == nil {
            logger.V(3).Info(
                "Short-circuiting remediation",
                totalTargetKeyLog, totalTargets,
                maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
                unhealthyTargetsKeyLog, len(unhealthy),
            )
            message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
                totalTargets,
                len(unhealthy),
                m.Spec.MaxUnhealthy)
        } else {
            logger.V(3).Info(
                "Short-circuiting remediation",
                totalTargetKeyLog, totalTargets,
                unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
                unhealthyTargetsKeyLog, len(unhealthy),
            )
            message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
                totalTargets,
                len(unhealthy),
                *m.Spec.UnhealthyRange)
        }

        // Remediation is not allowed: the number of not started or unhealthy machines either exceeds maxUnhealthy or is not within unhealthyRange.
        m.Status.RemediationsAllowed = 0
        conditions.Set(m, &clusterv1.Condition{
            Type:     clusterv1.RemediationAllowedCondition,
            Status:   corev1.ConditionFalse,
            Severity: clusterv1.ConditionSeverityWarning,
            Reason:   clusterv1.TooManyUnhealthyReason,
            Message:  message,
        })

        r.recorder.Event(
            m,
            corev1.EventTypeWarning,
            EventRemediationRestricted,
            message,
        )
        errList := []error{}
        for _, t := range append(healthy, unhealthy...) {
            if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
                errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
                continue
            }
        }
        if len(errList) > 0 {
            return ctrl.Result{}, kerrors.NewAggregate(errList)
        }
        return reconcile.Result{Requeue: true}, nil
    }

    if m.Spec.UnhealthyRange == nil {
        logger.V(3).Info(
            "Remediations are allowed",
            totalTargetKeyLog, totalTargets,
            maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
            unhealthyTargetsKeyLog, len(unhealthy),
        )
    } else {
        logger.V(3).Info(
            "Remediations are allowed",
            totalTargetKeyLog, totalTargets,
            unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
            unhealthyTargetsKeyLog, len(unhealthy),
        )
    }

    // Remediation is allowed, so either unhealthyMachineCount is within unhealthyRange or maxUnhealthy - unhealthyMachineCount >= 0.
    m.Status.RemediationsAllowed = remediationCount
    conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

    errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
    errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)

    // handle update errors
    if len(errList) > 0 {
        logger.V(3).Info("Error(s) marking machine, requeuing")
        return reconcile.Result{}, kerrors.NewAggregate(errList)
    }

    if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
        logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
        return ctrl.Result{RequeueAfter: minNextCheck}, nil
    }

    logger.V(3).Info("No more targets meet unhealthy criteria")

    return ctrl.Result{}, nil
}

// patchHealthyTargets patches healthy machines with MachineHealthCheckSucceededCondition.
func (r *Reconciler) patchHealthyTargets(ctx context.Context, logger logr.Logger, healthy []healthCheckTarget, m *clusterv1.MachineHealthCheck) []error {
    errList := []error{}
    for _, t := range healthy {
        if m.Spec.RemediationTemplate != nil {
            // Get remediation request object
            obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
            if err != nil {
                if !apierrors.IsNotFound(errors.Cause(err)) {
                    wrappedErr := errors.Wrapf(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName)
                    errList = append(errList, wrappedErr)
                }
                continue
            }
            // Check that obj has no DeletionTimestamp to avoid hot loop
            if obj.GetDeletionTimestamp() == nil {
                // Issue a delete for remediation request.
                if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
                    errList = append(errList, errors.Wrapf(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name))
                    continue
                }
            }
        }

        if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
            logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
            errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
        }
    }
    return errList
}

// patchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation.
func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logger, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
    // mark for remediation
    errList := []error{}
    for _, t := range unhealthy {
        condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)

        if annotations.IsPaused(cluster, t.Machine) {
            logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
        } else {
            if m.Spec.RemediationTemplate != nil {
                // If external remediation request already exists,
                // return early
                if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) {
                    return errList
                }

                cloneOwnerRef := &metav1.OwnerReference{
                    APIVersion: clusterv1.GroupVersion.String(),
                    Kind:       "Machine",
                    Name:       t.Machine.Name,
                    UID:        t.Machine.UID,
                }

                from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
                if err != nil {
                    conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())
                    errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
                    return errList
                }

                generateTemplateInput := &external.GenerateTemplateInput{
                    Template:    from,
                    TemplateRef: m.Spec.RemediationTemplate,
                    Namespace:   t.Machine.Namespace,
                    ClusterName: t.Machine.Spec.ClusterName,
                    OwnerRef:    cloneOwnerRef,
                }
                to, err := external.GenerateTemplate(generateTemplateInput)
                if err != nil {
                    errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
                    return errList
                }

                // Set the Remediation Request to match the Machine name, the name is used to
                // guarantee uniqueness between runs. A Machine should only ever have a single
                // remediation object of a specific GVK created.
                //
                // NOTE: This doesn't guarantee uniqueness across different MHC objects watching
                // the same Machine, users are in charge of setting health checks and remediation properly.
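                //
                // Illustrative example (editor's note, not from the upstream file; names are
                // hypothetical): given a RemediationTemplate of kind "MyProviderRemediationTemplate"
                // and a Machine named "worker-abc12", the generated request is expected to use the
                // kind with the "Template" suffix trimmed, i.e. "MyProviderRemediation" (see the
                // matching lookup in getExternalRemediationRequest below), and the name "worker-abc12".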
                to.SetName(t.Machine.Name)

                logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
                // Create the external clone.
                if err := r.Client.Create(ctx, to); err != nil {
                    conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())
                    errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
                    return errList
                }
            } else {
                logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
                // NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing, or for triggering another remediation if the previous one is completed;
                // instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
                if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
                    conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
                }
            }
        }

        if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
            errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
            continue
        }
        r.recorder.Eventf(
            t.Machine,
            corev1.EventTypeNormal,
            EventMachineMarkedUnhealthy,
            "Machine %v has been marked as unhealthy",
            t.string(),
        )
    }
    return errList
}

// clusterToMachineHealthCheck maps events from Cluster objects to
// MachineHealthCheck objects that belong to the Cluster.
func (r *Reconciler) clusterToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
    c, ok := o.(*clusterv1.Cluster)
    if !ok {
        panic(fmt.Sprintf("Expected a Cluster, got %T", o))
    }

    mhcList := &clusterv1.MachineHealthCheckList{}
    if err := r.Client.List(
        ctx,
        mhcList,
        client.InNamespace(c.Namespace),
        client.MatchingLabels{clusterv1.ClusterNameLabel: c.Name},
    ); err != nil {
        return nil
    }

    // This list should only contain MachineHealthChecks which belong to the given Cluster
    requests := []reconcile.Request{}
    for _, mhc := range mhcList.Items {
        key := types.NamespacedName{Namespace: mhc.Namespace, Name: mhc.Name}
        requests = append(requests, reconcile.Request{NamespacedName: key})
    }
    return requests
}

// machineToMachineHealthCheck maps events from Machine objects to
// MachineHealthCheck objects that monitor the given machine.
func (r *Reconciler) machineToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
    m, ok := o.(*clusterv1.Machine)
    if !ok {
        panic(fmt.Sprintf("Expected a Machine, got %T", o))
    }

    mhcList := &clusterv1.MachineHealthCheckList{}
    if err := r.Client.List(
        ctx,
        mhcList,
        client.InNamespace(m.Namespace),
        client.MatchingLabels{clusterv1.ClusterNameLabel: m.Spec.ClusterName},
    ); err != nil {
        return nil
    }

    var requests []reconcile.Request
    for k := range mhcList.Items {
        mhc := &mhcList.Items[k]
        if machine.HasMatchingLabels(mhc.Spec.Selector, m.Labels) {
            key := util.ObjectKey(mhc)
            requests = append(requests, reconcile.Request{NamespacedName: key})
        }
    }
    return requests
}

func (r *Reconciler) nodeToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
    node, ok := o.(*corev1.Node)
    if !ok {
        panic(fmt.Sprintf("Expected a corev1.Node, got %T", o))
    }

    machine, err := getMachineFromNode(ctx, r.Client, node.Name)
    if machine == nil || err != nil {
        return nil
    }

    return r.machineToMachineHealthCheck(ctx, machine)
}

func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error {
    // If there is no tracker, don't watch remote nodes
    if r.Tracker == nil {
        return nil
    }

    return r.Tracker.Watch(ctx, remote.WatchInput{
        Name:         "machinehealthcheck-watchClusterNodes",
        Cluster:      util.ObjectKey(cluster),
        Watcher:      r.controller,
        Kind:         &corev1.Node{},
        EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachineHealthCheck),
    })
}

// getMachineFromNode retrieves the Machine with a nodeRef pointing to nodeName.
// There should be at most one Machine with a given nodeRef; an error is returned otherwise.
func getMachineFromNode(ctx context.Context, c client.Client, nodeName string) (*clusterv1.Machine, error) {
    machineList := &clusterv1.MachineList{}
    if err := c.List(
        ctx,
        machineList,
        client.MatchingFields{index.MachineNodeNameField: nodeName},
    ); err != nil {
        return nil, errors.Wrap(err, "failed getting machine list")
    }
    // TODO(vincepri): Remove this loop once controller runtime fake client supports
    // adding indexes on objects.
    items := []*clusterv1.Machine{}
    for i := range machineList.Items {
        machine := &machineList.Items[i]
        if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == nodeName {
            items = append(items, machine)
        }
    }
    if len(items) != 1 {
        return nil, errors.Errorf("expecting one machine for node %v, got %v", nodeName, machineNames(items))
    }
    return items[0], nil
}

func machineNames(machines []*clusterv1.Machine) []string {
    result := make([]string, 0, len(machines))
    for _, m := range machines {
        result = append(result, m.Name)
    }
    return result
}

// isAllowedRemediation checks the value of the MaxUnhealthy field to determine whether remediation
// should be allowed, and returns the decision, the remediation count, and an error if any.
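//
// Illustrative arithmetic (editor's example with assumed values, not from the upstream file):
// with status.expectedMachines = 10, status.currentHealthy = 7 and maxUnhealthy = "40%",
// getMaxUnhealthy yields 4 (40% of 10, rounded down) and unhealthyMachineCount yields 3,
// so remediation is allowed and the remediation count is 1. With currentHealthy = 5,
// unhealthyMachineCount is 5 > 4 and remediation is short-circuited.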
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
    var remediationAllowed bool
    var remediationCount int32
    if mhc.Spec.UnhealthyRange != nil {
        min, max, err := getUnhealthyRange(mhc)
        if err != nil {
            return false, 0, err
        }
        unhealthyMachineCount := unhealthyMachineCount(mhc)
        remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
        remediationCount = int32(max - unhealthyMachineCount)
        return remediationAllowed, remediationCount, nil
    }

    maxUnhealthy, err := getMaxUnhealthy(mhc)
    if err != nil {
        return false, 0, err
    }

    // Remediation is not allowed if unhealthy is above maxUnhealthy
    unhealthyMachineCount := unhealthyMachineCount(mhc)
    remediationAllowed = unhealthyMachineCount <= maxUnhealthy
    remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
    return remediationAllowed, remediationCount, nil
}

// getUnhealthyRange parses an integer range and returns the min and max values.
// E.g. [2-5] will return (2, 5, nil).
func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
    // remove '[' and ']'
    unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]

    parts := strings.Split(unhealthyRange, "-")

    min, err := strconv.ParseUint(parts[0], 10, 32)
    if err != nil {
        return 0, 0, err
    }

    max, err := strconv.ParseUint(parts[1], 10, 32)
    if err != nil {
        return 0, 0, err
    }

    if max < min {
        return 0, 0, errors.Errorf("max value %d cannot be less than min value %d for unhealthyRange", max, min)
    }

    return int(min), int(max), nil
}

func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
    if mhc.Spec.MaxUnhealthy == nil {
        return 0, errors.New("spec.maxUnhealthy must be set")
    }
    maxUnhealthy, err := intstr.GetScaledValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, int(mhc.Status.ExpectedMachines), false)
    if err != nil {
        return 0, err
    }
    return maxUnhealthy, nil
}

// unhealthyMachineCount calculates the number of presently unhealthy or missing machines,
// i.e. the delta between the expected number of machines and the current number deemed healthy.
func unhealthyMachineCount(mhc *clusterv1.MachineHealthCheck) int {
    return int(mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy)
}

// getExternalRemediationRequest gets a reference to the External Remediation Request as an unstructured object.
func (r *Reconciler) getExternalRemediationRequest(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) (*unstructured.Unstructured, error) {
    remediationRef := &corev1.ObjectReference{
        APIVersion: m.Spec.RemediationTemplate.APIVersion,
        Kind:       strings.TrimSuffix(m.Spec.RemediationTemplate.Kind, clusterv1.TemplateSuffix),
        Name:       machineName,
    }
    remediationReq, err := external.Get(ctx, r.Client, remediationRef, m.Namespace)
    if err != nil {
        return nil, errors.Wrapf(err, "failed to retrieve external remediation request object")
    }
    return remediationReq, nil
}

// externalRemediationRequestExists checks if the External Remediation Request is created
// for the machine.
func (r *Reconciler) externalRemediationRequestExists(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) bool {
    remediationReq, err := r.getExternalRemediationRequest(ctx, m, machineName)
    if err != nil {
        return false
    }
    return remediationReq != nil
}
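
// Usage sketch (editor's illustration, not part of the upstream file): wiring this Reconciler
// into a controller-runtime manager. The `tracker` value is an assumed, pre-constructed
// *remote.ClusterCacheTracker, and the concurrency value is only an example.
//
//    r := &Reconciler{
//        Client:           mgr.GetClient(),
//        Tracker:          tracker,
//        WatchFilterValue: "",
//    }
//    if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}); err != nil {
//        return errors.Wrap(err, "unable to set up MachineHealthCheck reconciler")
//    }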