k8s.io/kubernetes@v1.29.3/pkg/controller/volume/attachdetach/reconciler/reconciler.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package reconciler implements interfaces that attempt to reconcile the
// desired state of the world with the actual state of the world by triggering
// actions.
package reconciler

import (
	"context"
	"fmt"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater"
	kevents "k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
	nodeutil "k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/pkg/util/taints"
	"k8s.io/kubernetes/pkg/volume/util"
	"k8s.io/kubernetes/pkg/volume/util/operationexecutor"
)

// Reconciler runs a periodic loop to reconcile the desired state of the world with
// the actual state of the world by triggering attach/detach operations.
// Note: This is distinct from the Reconciler implemented by the kubelet volume
// manager. This one reconciles state for the attach/detach controller; that one
// reconciles state for the kubelet volume manager.
type Reconciler interface {
	// Run starts the reconciliation loop, which executes periodically, checking
	// whether volumes that should be attached are attached and volumes that
	// should be detached are detached. If not, it triggers attach/detach
	// operations to rectify the state.
	Run(ctx context.Context)
}

// NewReconciler returns a new instance of Reconciler that waits loopPeriod
// between successive executions.
// loopPeriod is the amount of time the reconciler loop waits between
// successive executions.
// maxWaitForUnmountDuration is the max amount of time the reconciler will wait
// for the volume to be safely unmounted; after this it will detach the volume
// anyway (to handle crashed/unavailable nodes). If during this time the volume
// becomes used by a new pod, the detach request will be aborted and the timer
// cleared.
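//
// A minimal wiring sketch (illustration only; the attach/detach controller
// constructs these dependencies itself, and the durations below are assumed
// values rather than authoritative defaults):
//
//	rc := NewReconciler(
//		100*time.Millisecond, // loopPeriod (assumed)
//		6*time.Minute,        // maxWaitForUnmountDuration (assumed)
//		time.Minute,          // syncDuration (assumed)
//		false,                // disableReconciliationSync
//		desiredStateOfWorld, actualStateOfWorld, attacherDetacher,
//		nodeStatusUpdater, nodeLister, recorder)
//	go rc.Run(ctx)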
func NewReconciler(
	loopPeriod time.Duration,
	maxWaitForUnmountDuration time.Duration,
	syncDuration time.Duration,
	disableReconciliationSync bool,
	desiredStateOfWorld cache.DesiredStateOfWorld,
	actualStateOfWorld cache.ActualStateOfWorld,
	attacherDetacher operationexecutor.OperationExecutor,
	nodeStatusUpdater statusupdater.NodeStatusUpdater,
	nodeLister corelisters.NodeLister,
	recorder record.EventRecorder) Reconciler {
	return &reconciler{
		loopPeriod:                loopPeriod,
		maxWaitForUnmountDuration: maxWaitForUnmountDuration,
		syncDuration:              syncDuration,
		disableReconciliationSync: disableReconciliationSync,
		desiredStateOfWorld:       desiredStateOfWorld,
		actualStateOfWorld:        actualStateOfWorld,
		attacherDetacher:          attacherDetacher,
		nodeStatusUpdater:         nodeStatusUpdater,
		nodeLister:                nodeLister,
		timeOfLastSync:            time.Now(),
		recorder:                  recorder,
	}
}

type reconciler struct {
	loopPeriod                time.Duration
	maxWaitForUnmountDuration time.Duration
	syncDuration              time.Duration
	desiredStateOfWorld       cache.DesiredStateOfWorld
	actualStateOfWorld        cache.ActualStateOfWorld
	attacherDetacher          operationexecutor.OperationExecutor
	nodeStatusUpdater         statusupdater.NodeStatusUpdater
	nodeLister                corelisters.NodeLister
	timeOfLastSync            time.Time
	disableReconciliationSync bool
	recorder                  record.EventRecorder
}

func (rc *reconciler) Run(ctx context.Context) {
	wait.UntilWithContext(ctx, rc.reconciliationLoopFunc(ctx), rc.loopPeriod)
}

// reconciliationLoopFunc returns the function that is run on every loop
// iteration. Its periodic sync can be disabled via the CLI option
// disableReconciliationSync. The sync checks whether the attached volumes
// from the actual state are still attached to their nodes and updates the
// status if they are not.
func (rc *reconciler) reconciliationLoopFunc(ctx context.Context) func(context.Context) {
	return func(ctx context.Context) {

		rc.reconcile(ctx)
		logger := klog.FromContext(ctx)
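		// The periodic "still attached" sync below is gated: it runs only when it
		// has not been disabled, when syncDuration is at least one second, and when
		// at least syncDuration has elapsed since the last sync.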
		if rc.disableReconciliationSync {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is disabled via the command line")
		} else if rc.syncDuration < time.Second {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is set to less than one second via the command line")
		} else if time.Since(rc.timeOfLastSync) > rc.syncDuration {
			logger.V(5).Info("Starting reconciling attached volumes still attached")
			rc.sync()
		}
	}
}

func (rc *reconciler) sync() {
	defer rc.updateSyncTime()
	rc.syncStates()
}

func (rc *reconciler) updateSyncTime() {
	rc.timeOfLastSync = time.Now()
}

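// syncStates asks the operation executor to verify that the volumes recorded
// as attached in the actual state of the world are in fact still attached,
// so that stale entries can be corrected.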
func (rc *reconciler) syncStates() {
	volumesPerNode := rc.actualStateOfWorld.GetAttachedVolumesPerNode()
	rc.attacherDetacher.VerifyVolumesAreAttached(volumesPerNode, rc.actualStateOfWorld)
}

// hasOutOfServiceTaint returns true if the node has the out-of-service taint present.
func (rc *reconciler) hasOutOfServiceTaint(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return taints.TaintKeyExists(node.Spec.Taints, v1.TaintNodeOutOfService), nil
}

// nodeIsHealthy returns true if the node looks healthy.
func (rc *reconciler) nodeIsHealthy(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return nodeutil.IsNodeReady(node), nil
}

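// reconcile first triggers detach operations for volumes that are attached but
// no longer present in the desired state of the world, then triggers attach
// operations for desired volumes that are not yet attached, and finally asks
// the node status updater to refresh the nodes' volumesAttached status.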
func (rc *reconciler) reconcile(ctx context.Context) {
	// Detaches are triggered before attaches so that volumes referenced by
	// pods that are rescheduled to a different node are detached first.

	// Ensure volumes that should be detached are detached.
	logger := klog.FromContext(ctx)
	for _, attachedVolume := range rc.actualStateOfWorld.GetAttachedVolumes() {
		if !rc.desiredStateOfWorld.VolumeExists(
			attachedVolume.VolumeName, attachedVolume.NodeName) {

			// Check whether there is already a pending operation, and don't even
			// try to start an operation if there is already one running.
			// This check must be done before we do any other checks, as otherwise the other checks
			// may pass while at the same time the volume leaves the pending state, resulting in
			// double detach attempts.
			// The operation key format is different depending on whether the volume
			// allows multi-attach across different nodes.
			if util.IsMultiAttachAllowed(attachedVolume.VolumeSpec) {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, attachedVolume.NodeName, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff for node. Can't start detach", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			} else {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, "" /* nodeName */, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff in the cluster. Can't start detach for node", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			}

			// Because the detach operation updates the ActualStateOfWorld before
			// marking itself complete, it's possible for the volume to be removed
			// from the ActualStateOfWorld between the GetAttachedVolumes() check
			// and the IsOperationPending() check above.
			// Check the ActualStateOfWorld again to avoid issuing an unnecessary
			// detach.
			// See https://github.com/kubernetes/kubernetes/issues/93902
			attachState := rc.actualStateOfWorld.GetAttachState(attachedVolume.VolumeName, attachedVolume.NodeName)
			if attachState == cache.AttachStateDetached {
				logger.V(5).Info("Volume detached--skipping", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Set the detach request time
			elapsedTime, err := rc.actualStateOfWorld.SetDetachRequestTime(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Cannot trigger detach because it fails to set detach request time with error")
				continue
			}
			// Check whether the elapsed time since the detach request has exceeded the maximum waiting time
			timeout := elapsedTime > rc.maxWaitForUnmountDuration

			isHealthy, err := rc.nodeIsHealthy(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get health of node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Force detach volumes from unhealthy nodes after maxWaitForUnmountDuration.
			forceDetach := !isHealthy && timeout

			hasOutOfServiceTaint, err := rc.hasOutOfServiceTaint(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get taint specs for node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Check whether the volume is still mounted. Skip the detach if it is still mounted, unless
			// the force-detach timeout has expired or the node has the `node.kubernetes.io/out-of-service` taint.
			if attachedVolume.MountedByNode && !forceDetach && !hasOutOfServiceTaint {
				logger.V(5).Info("Cannot detach volume because it is still mounted", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Before triggering the volume detach, mark the volume as detached and update the node status.
			// If updating the node status fails, skip detaching the volume.
			// If the volume detach operation fails, the volume needs to be added back to the report-as-attached
			// list so that the node status has the correct volume attachment information.
			err = rc.actualStateOfWorld.RemoveVolumeFromReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.V(5).Info("RemoveVolumeFromReportAsAttached failed while removing volume from node",
					"node", klog.KRef("", string(attachedVolume.NodeName)),
					"volumeName", attachedVolume.VolumeName,
					"err", err)
			}

			// Update Node Status to indicate volume is no longer safe to mount.
			err = rc.nodeStatusUpdater.UpdateNodeStatusForNode(logger, attachedVolume.NodeName)
			if err != nil {
				// Skip detaching this volume if unable to update node status
				logger.Error(err, "UpdateNodeStatusForNode failed while attempting to report volume as attached", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				// Add the volume back to ReportAsAttached if the UpdateNodeStatusForNode call failed so that
				// the node status updater will add it back to the VolumeAttached list.
				// It is needed here too because DetachVolume is not actually called and we keep the data
				// consistent on every reconcile.
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
				continue
			}

			// Trigger the volume detach, which requires the verify-safe-to-detach step.
			// If timeout is true, skip the verifySafeToDetach check.
			// If the node has the node.kubernetes.io/out-of-service taint, also skip the verifySafeToDetach check.
			logger.V(5).Info("Starting attacherDetacher.DetachVolume", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
			if hasOutOfServiceTaint {
				logger.V(4).Info("node has out-of-service taint", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}
			verifySafeToDetach := !(timeout || hasOutOfServiceTaint)
			err = rc.attacherDetacher.DetachVolume(logger, attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld)
			if err == nil {
				if verifySafeToDetach { // normal detach
					logger.Info("attacherDetacher.DetachVolume started", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				} else { // force detach
					if timeout {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonTimeout)
						logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching",
							"duration", rc.maxWaitForUnmountDuration,
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					} else {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonOutOfService)
						logger.Info("attacherDetacher.DetachVolume started: node has out-of-service taint, force detaching",
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					}
				}
			}
			if err != nil {
				// Add the volume back to ReportAsAttached if the DetachVolume call failed so that the node
				// status updater will add it back to the VolumeAttached list.
				// This function is also called while executing the volume detach operation in operation_generator.
				// It is needed here too because the DetachVolume call might fail before executing the actual
				// operation in operation_executor (e.g., the volume plugin cannot be found).
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)

				if !exponentialbackoff.IsExponentialBackoff(err) {
					// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
					// Log all other errors.
					logger.Error(err, "attacherDetacher.DetachVolume failed to start", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				}
			}
		}
	}

	rc.attachDesiredVolumes(logger)

	// Update Node Status
	err := rc.nodeStatusUpdater.UpdateNodeStatuses(logger)
	if err != nil {
		logger.Info("UpdateNodeStatuses failed", "err", err)
	}
}

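// attachDesiredVolumes walks the desired state of the world and triggers an
// attach operation for every volume/node pair that is not already attached,
// skipping pairs with an operation still pending and reporting a multi-attach
// error when a volume that does not allow multi-attach is already attached to
// other nodes.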
func (rc *reconciler) attachDesiredVolumes(logger klog.Logger) {
	// Ensure volumes that should be attached are attached.
	for _, volumeToAttach := range rc.desiredStateOfWorld.GetVolumesToAttach() {
		if util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			// Don't even try to start an operation if there is already one running for the given volume and node.
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, volumeToAttach.NodeName) {
				logger.V(10).Info("Operation for volume is already running for node. Can't start attach", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeName", volumeToAttach.VolumeName)
				continue
			}
		} else {
			// Don't even try to start an operation if there is already one running for the given volume
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, "" /* nodeName */) {
				logger.V(10).Info("Operation for volume is already running. Can't start attach for node", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeNames", volumeToAttach.VolumeName)
				continue
			}
		}

		// Because the attach operation updates the ActualStateOfWorld before
		// marking itself complete, IsOperationPending() must be checked before
		// GetAttachState() to guarantee the ActualStateOfWorld is
		// up-to-date when it's read.
		// See https://github.com/kubernetes/kubernetes/issues/93902
		attachState := rc.actualStateOfWorld.GetAttachState(volumeToAttach.VolumeName, volumeToAttach.NodeName)
		if attachState == cache.AttachStateAttached {
			// Volume/Node exists, touch it to reset detachRequestedTime
			logger.V(10).Info("Volume attached--touching", "volume", volumeToAttach)
			rc.actualStateOfWorld.ResetDetachRequestTime(logger, volumeToAttach.VolumeName, volumeToAttach.NodeName)
			continue
		}

		if !util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			nodes := rc.actualStateOfWorld.GetNodesForAttachedVolume(volumeToAttach.VolumeName)
			if len(nodes) > 0 {
				if !volumeToAttach.MultiAttachErrorReported {
					rc.reportMultiAttachError(logger, volumeToAttach, nodes)
					rc.desiredStateOfWorld.SetMultiAttachError(volumeToAttach.VolumeName, volumeToAttach.NodeName)
				}
				continue
			}
		}

		// Volume/Node doesn't exist, spawn a goroutine to attach it
		logger.V(5).Info("Starting attacherDetacher.AttachVolume", "volume", volumeToAttach)
		err := rc.attacherDetacher.AttachVolume(logger, volumeToAttach.VolumeToAttach, rc.actualStateOfWorld)
		if err == nil {
			logger.Info("attacherDetacher.AttachVolume started", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
		if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
			// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
			// Log all other errors.
			logger.Error(err, "attacherDetacher.AttachVolume failed to start", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
	}
}

// reportMultiAttachError sends events and logs the situation where a volume
// that should be attached to a node is already attached to a different node (or nodes).
func (rc *reconciler) reportMultiAttachError(logger klog.Logger, volumeToAttach cache.VolumeToAttach, nodes []types.NodeName) {
	// Filter out the current node from the list of nodes where the volume is
	// attached.
	// Some methods need []string, some others need []NodeName, so collect both.
	// In theory, these slices should always have only one element - the
	// controller does not allow more than one attachment. But use slices just
	// in case...
	otherNodes := []types.NodeName{}
	otherNodesStr := []string{}
	for _, node := range nodes {
		if node != volumeToAttach.NodeName {
			otherNodes = append(otherNodes, node)
			otherNodesStr = append(otherNodesStr, string(node))
		}
	}

	// Get the list of pods that use the volume on the other nodes.
	pods := rc.desiredStateOfWorld.GetVolumePodsOnNodes(otherNodes, volumeToAttach.VolumeName)
	if len(pods) == 0 {
		// We did not find any pods that request the volume. The pods must have been deleted already.
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", "Volume is already exclusively attached to one node and can't be attached to another")
		for _, pod := range volumeToAttach.ScheduledPods {
			rc.recorder.Eventf(pod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
		}
		// Log a detailed message for the system admin
		logger.Info("Multi-Attach error: volume is already exclusively attached and can't be attached to another node", "attachedTo", otherNodesStr, "volume", volumeToAttach)
		return
	}

	// There are pods that require the volume and run on another node. Typically
	// it's a user error, e.g. a ReplicaSet uses a PVC and has more than one replica.
	// Let the user know which pods are blocking the volume.
	for _, scheduledPod := range volumeToAttach.ScheduledPods {
		// Each scheduledPod must get a custom message. They can run in
		// different namespaces, and a user of one namespace should not see the
		// names of pods in other namespaces.
		localPodNames := []string{} // Names of pods in scheduledPod's namespace
		otherPods := 0              // Count of pods in other namespaces
		for _, pod := range pods {
			if pod.Namespace == scheduledPod.Namespace {
				localPodNames = append(localPodNames, pod.Name)
			} else {
				otherPods++
			}
		}

		var msg string
		if len(localPodNames) > 0 {
			msg = fmt.Sprintf("Volume is already used by pod(s) %s", strings.Join(localPodNames, ", "))
			if otherPods > 0 {
				msg = fmt.Sprintf("%s and %d pod(s) in different namespaces", msg, otherPods)
			}
		} else {
			// No local pods, there are pods only in different namespaces.
			msg = fmt.Sprintf("Volume is already used by %d pod(s) in different namespaces", otherPods)
		}
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", msg)
		rc.recorder.Eventf(scheduledPod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
	}

	// Log all pods for the system admin
	logger.Info("Multi-Attach error: volume is already used by pods", "pods", klog.KObjSlice(pods), "attachedTo", otherNodesStr, "volume", volumeToAttach)
}