k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/volume/attachdetach/reconciler/reconciler.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package reconciler implements interfaces that attempt to reconcile the
// desired state of the world with the actual state of the world by triggering
// actions.
package reconciler

import (
	"context"
	"fmt"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater"
	kevents "k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
	nodeutil "k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/pkg/util/taints"
	"k8s.io/kubernetes/pkg/volume/util"
	"k8s.io/kubernetes/pkg/volume/util/operationexecutor"
)

// Reconciler runs a periodic loop to reconcile the desired state of the world with
// the actual state of the world by triggering attach/detach operations.
// Note: This is distinct from the Reconciler implemented by the kubelet volume
// manager. This reconciles state for the attach/detach controller. That
// reconciles state for the kubelet volume manager.
type Reconciler interface {
	// Run starts the reconciliation loop, which executes periodically and checks
	// whether volumes that should be attached are attached and volumes that
	// should be detached are detached. If not, it triggers attach/detach
	// operations to rectify.
	Run(ctx context.Context)
}

// NewReconciler returns a new instance of Reconciler that waits loopPeriod
// between successive executions.
// loopPeriod is the amount of time the reconciler loop waits between
// successive executions.
// maxWaitForUnmountDuration is the max amount of time the reconciler will wait
// for the volume to be safely unmounted; after this it will detach the volume
// anyway (to handle crashed/unavailable nodes). If during this time the volume
// becomes used by a new pod, the detach request will be aborted and the timer
// cleared.
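//
// A minimal construction sketch (illustrative only, not taken from the
// attach/detach controller itself). The durations are assumed example values,
// and ctx, desiredStateOfWorld, actualStateOfWorld, attacherDetacher,
// nodeStatusUpdater, nodeLister and recorder are assumed to be provided by the
// surrounding controller setup:
//
//	rc := reconciler.NewReconciler(
//		100*time.Millisecond, // loopPeriod
//		6*time.Minute,        // maxWaitForUnmountDuration
//		time.Minute,          // syncDuration
//		false,                // disableReconciliationSync
//		false,                // disableForceDetachOnTimeout
//		desiredStateOfWorld,
//		actualStateOfWorld,
//		attacherDetacher,
//		nodeStatusUpdater,
//		nodeLister,
//		recorder)
//	go rc.Run(ctx)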
func NewReconciler(
	loopPeriod time.Duration,
	maxWaitForUnmountDuration time.Duration,
	syncDuration time.Duration,
	disableReconciliationSync bool,
	disableForceDetachOnTimeout bool,
	desiredStateOfWorld cache.DesiredStateOfWorld,
	actualStateOfWorld cache.ActualStateOfWorld,
	attacherDetacher operationexecutor.OperationExecutor,
	nodeStatusUpdater statusupdater.NodeStatusUpdater,
	nodeLister corelisters.NodeLister,
	recorder record.EventRecorder) Reconciler {
	return &reconciler{
		loopPeriod:                  loopPeriod,
		maxWaitForUnmountDuration:   maxWaitForUnmountDuration,
		syncDuration:                syncDuration,
		disableReconciliationSync:   disableReconciliationSync,
		disableForceDetachOnTimeout: disableForceDetachOnTimeout,
		desiredStateOfWorld:         desiredStateOfWorld,
		actualStateOfWorld:          actualStateOfWorld,
		attacherDetacher:            attacherDetacher,
		nodeStatusUpdater:           nodeStatusUpdater,
		nodeLister:                  nodeLister,
		timeOfLastSync:              time.Now(),
		recorder:                    recorder,
	}
}

type reconciler struct {
	loopPeriod                  time.Duration
	maxWaitForUnmountDuration   time.Duration
	syncDuration                time.Duration
	desiredStateOfWorld         cache.DesiredStateOfWorld
	actualStateOfWorld          cache.ActualStateOfWorld
	attacherDetacher            operationexecutor.OperationExecutor
	nodeStatusUpdater           statusupdater.NodeStatusUpdater
	nodeLister                  corelisters.NodeLister
	timeOfLastSync              time.Time
	disableReconciliationSync   bool
	disableForceDetachOnTimeout bool
	recorder                    record.EventRecorder
}

func (rc *reconciler) Run(ctx context.Context) {
	wait.UntilWithContext(ctx, rc.reconciliationLoopFunc(ctx), rc.loopPeriod)
}

// reconciliationLoopFunc returns the function executed on every loop iteration.
// The "volumes still attached" sync it performs can be disabled via the CLI
// option disableReconciliationSync. That sync periodically checks whether the
// attached volumes from the actual state are still attached to the node and
// updates the status if they are not.
func (rc *reconciler) reconciliationLoopFunc(ctx context.Context) func(context.Context) {
	return func(ctx context.Context) {

		rc.reconcile(ctx)
		logger := klog.FromContext(ctx)
		if rc.disableReconciliationSync {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is disabled via the command line")
		} else if rc.syncDuration < time.Second {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is set to less than one second via the command line")
		} else if time.Since(rc.timeOfLastSync) > rc.syncDuration {
			logger.V(5).Info("Starting reconciling attached volumes still attached")
			rc.sync()
		}
	}
}

func (rc *reconciler) sync() {
	defer rc.updateSyncTime()
	rc.syncStates()
}

func (rc *reconciler) updateSyncTime() {
	rc.timeOfLastSync = time.Now()
}

func (rc *reconciler) syncStates() {
	volumesPerNode := rc.actualStateOfWorld.GetAttachedVolumesPerNode()
	rc.attacherDetacher.VerifyVolumesAreAttached(volumesPerNode, rc.actualStateOfWorld)
}

// hasOutOfServiceTaint returns true if the node has the out-of-service taint present.
func (rc *reconciler) hasOutOfServiceTaint(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return taints.TaintKeyExists(node.Spec.Taints, v1.TaintNodeOutOfService), nil
}
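
// For illustration only (not part of the original controller flow): the taint
// checked for above would appear on the Node spec roughly as below. The value
// and effect shown are assumptions; TaintKeyExists compares only the key,
// which is typically set by a cluster operator when a node is known to be
// shut down or otherwise unrecoverable.
//
//	node.Spec.Taints = []v1.Taint{{
//		Key:    v1.TaintNodeOutOfService, // "node.kubernetes.io/out-of-service"
//		Value:  "nodeshutdown",           // assumed example value
//		Effect: v1.TaintEffectNoExecute,
//	}}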

// nodeIsHealthy returns true if the node looks healthy.
func (rc *reconciler) nodeIsHealthy(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return nodeutil.IsNodeReady(node), nil
}

func (rc *reconciler) reconcile(ctx context.Context) {
	// Detaches are triggered before attaches so that volumes referenced by
	// pods that are rescheduled to a different node are detached first.

	// Ensure volumes that should be detached are detached.
	logger := klog.FromContext(ctx)
	for _, attachedVolume := range rc.actualStateOfWorld.GetAttachedVolumes() {
		if !rc.desiredStateOfWorld.VolumeExists(
			attachedVolume.VolumeName, attachedVolume.NodeName) {

			// Check whether an operation is already pending, and don't even
			// try to start an operation if one is already running.
			// This check must be done before any other checks, as otherwise the other checks
			// may pass while at the same time the volume leaves the pending state, resulting in
			// double detach attempts.
			// The operation key format is different depending on whether the volume
			// allows multi-attach across different nodes.
			if util.IsMultiAttachAllowed(attachedVolume.VolumeSpec) {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, attachedVolume.NodeName, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff for node. Can't start detach", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			} else {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, "" /* nodeName */, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff in the cluster. Can't start detach for node", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			}

			// Because the detach operation updates the ActualStateOfWorld before
			// marking itself complete, it's possible for the volume to be removed
			// from the ActualStateOfWorld between the GetAttachedVolumes() check
			// and the IsOperationSafeToRetry() check above.
			// Check the ActualStateOfWorld again to avoid issuing an unnecessary
			// detach.
			// See https://github.com/kubernetes/kubernetes/issues/93902
			attachState := rc.actualStateOfWorld.GetAttachState(attachedVolume.VolumeName, attachedVolume.NodeName)
			if attachState == cache.AttachStateDetached {
				logger.V(5).Info("Volume detached--skipping", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Set the detach request time
			elapsedTime, err := rc.actualStateOfWorld.SetDetachRequestTime(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Cannot trigger detach because it fails to set detach request time with error")
				continue
			}
			// Check whether the unmount drain timer has expired
			maxWaitForUnmountDurationExpired := elapsedTime > rc.maxWaitForUnmountDuration

			isHealthy, err := rc.nodeIsHealthy(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get health of node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Force detach volumes from unhealthy nodes after maxWaitForUnmountDuration if force detach is enabled.
			// Ensure that the timeout condition checks this correctly so that the correct metric is updated below.
			forceDetachTimeoutExpired := maxWaitForUnmountDurationExpired && !rc.disableForceDetachOnTimeout
			if maxWaitForUnmountDurationExpired && rc.disableForceDetachOnTimeout {
				logger.V(5).Info("Drain timeout expired for volume but disableForceDetachOnTimeout was set", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
			}
			forceDetach := !isHealthy && forceDetachTimeoutExpired

			hasOutOfServiceTaint, err := rc.hasOutOfServiceTaint(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get taint specs for node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Check whether the volume is still mounted. Skip detach if it is still mounted unless we have
			// decided to force detach or the node has the `node.kubernetes.io/out-of-service` taint.
			if attachedVolume.MountedByNode && !forceDetach && !hasOutOfServiceTaint {
				logger.V(5).Info("Cannot detach volume because it is still mounted", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Before triggering the volume detach, mark the volume as detached and update the node status.
			// If updating the node status fails, skip detaching the volume.
			// If the volume detach operation fails, the volume needs to be added back to report-as-attached so that
			// the node status has the correct volume attachment information.
			err = rc.actualStateOfWorld.RemoveVolumeFromReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.V(5).Info("RemoveVolumeFromReportAsAttached failed while removing volume from node",
					"node", klog.KRef("", string(attachedVolume.NodeName)),
					"volumeName", attachedVolume.VolumeName,
					"err", err)
			}

			// Update Node Status to indicate volume is no longer safe to mount.
			err = rc.nodeStatusUpdater.UpdateNodeStatusForNode(logger, attachedVolume.NodeName)
			if err != nil {
				// Skip detaching this volume if unable to update node status
				logger.Error(err, "UpdateNodeStatusForNode failed while attempting to report volume as attached", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				// Add the volume back to ReportAsAttached if the UpdateNodeStatusForNode call failed so that the
				// node status updater will add it back to the VolumeAttached list.
				// It is needed here too because DetachVolume is not actually called and we want to keep the data
				// consistent on every reconcile.
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
				continue
			}

			// Trigger the volume detach operation, which normally requires the verify-safe-to-detach step.
			// If forceDetachTimeoutExpired is true, skip the verifySafeToDetach check.
			// If the node has the node.kubernetes.io/out-of-service taint with NoExecute effect, skip the verifySafeToDetach check.
			logger.V(5).Info("Starting attacherDetacher.DetachVolume", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
			if hasOutOfServiceTaint {
				logger.V(4).Info("node has out-of-service taint", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}
			verifySafeToDetach := !(forceDetachTimeoutExpired || hasOutOfServiceTaint)
			err = rc.attacherDetacher.DetachVolume(logger, attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld)
			if err == nil {
				if verifySafeToDetach { // normal detach
					logger.Info("attacherDetacher.DetachVolume started", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				} else { // force detach
					if forceDetachTimeoutExpired {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonTimeout)
						logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching",
							"duration", rc.maxWaitForUnmountDuration,
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					} else {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonOutOfService)
						logger.Info("attacherDetacher.DetachVolume started: node has out-of-service taint, force detaching",
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					}
				}
			}
			if err != nil {
				// Add the volume back to ReportAsAttached if the DetachVolume call failed so that the node status
				// updater will add it back to the VolumeAttached list.
				// This function is also called while executing the volume detach operation in operation_generator.
				// It is needed here too because the DetachVolume call might fail before executing the actual operation
				// in operation_executor (e.g., the volume plugin cannot be found, etc.).
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)

				if !exponentialbackoff.IsExponentialBackoff(err) {
					// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
					// Log all other errors.
					logger.Error(err, "attacherDetacher.DetachVolume failed to start", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				}
			}
		}
	}

	rc.attachDesiredVolumes(logger)

	// Update Node Status
	err := rc.nodeStatusUpdater.UpdateNodeStatuses(logger)
	if err != nil {
		logger.Info("UpdateNodeStatuses failed", "err", err)
	}
}

func (rc *reconciler) attachDesiredVolumes(logger klog.Logger) {
	// Ensure volumes that should be attached are attached.
	for _, volumeToAttach := range rc.desiredStateOfWorld.GetVolumesToAttach() {
		if util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			// Don't even try to start an operation if there is already one running for the given volume and node.
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, volumeToAttach.NodeName) {
				logger.V(10).Info("Operation for volume is already running for node. Can't start attach", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeName", volumeToAttach.VolumeName)
				continue
			}
		} else {
			// Don't even try to start an operation if there is already one running for the given volume
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, "" /* nodeName */) {
				logger.V(10).Info("Operation for volume is already running. Can't start attach for node", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeNames", volumeToAttach.VolumeName)
				continue
			}
		}

		// Because the attach operation updates the ActualStateOfWorld before
		// marking itself complete, IsOperationPending() must be checked before
		// GetAttachState() to guarantee the ActualStateOfWorld is
		// up-to-date when it's read.
		// See https://github.com/kubernetes/kubernetes/issues/93902
		attachState := rc.actualStateOfWorld.GetAttachState(volumeToAttach.VolumeName, volumeToAttach.NodeName)
		if attachState == cache.AttachStateAttached {
			// Volume/Node exists, touch it to reset detachRequestedTime
			logger.V(10).Info("Volume attached--touching", "volume", volumeToAttach)
			rc.actualStateOfWorld.ResetDetachRequestTime(logger, volumeToAttach.VolumeName, volumeToAttach.NodeName)
			continue
		}

		if !util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			nodes := rc.actualStateOfWorld.GetNodesForAttachedVolume(volumeToAttach.VolumeName)
			if len(nodes) > 0 {
				if !volumeToAttach.MultiAttachErrorReported {
					rc.reportMultiAttachError(logger, volumeToAttach, nodes)
					rc.desiredStateOfWorld.SetMultiAttachError(volumeToAttach.VolumeName, volumeToAttach.NodeName)
				}
				continue
			}
		}

		// Volume/Node doesn't exist, spawn a goroutine to attach it
		logger.V(5).Info("Starting attacherDetacher.AttachVolume", "volume", volumeToAttach)
		err := rc.attacherDetacher.AttachVolume(logger, volumeToAttach.VolumeToAttach, rc.actualStateOfWorld)
		if err == nil {
			logger.Info("attacherDetacher.AttachVolume started", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
		if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
			// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
			// Log all other errors.
			logger.Error(err, "attacherDetacher.AttachVolume failed to start", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
	}
}

// reportMultiAttachError sends events and logs the situation where a volume that
// should be attached to a node is already attached to one or more different nodes.
func (rc *reconciler) reportMultiAttachError(logger klog.Logger, volumeToAttach cache.VolumeToAttach, nodes []types.NodeName) {
	// Filter out the current node from the list of nodes where the volume is
	// attached.
	// Some methods need []string, others need []NodeName, so collect both.
	// In theory, these slices should always have only one element - the
	// controller does not allow more than one attachment. But use slices just
	// in case...
	otherNodes := []types.NodeName{}
	otherNodesStr := []string{}
	for _, node := range nodes {
		if node != volumeToAttach.NodeName {
			otherNodes = append(otherNodes, node)
			otherNodesStr = append(otherNodesStr, string(node))
		}
	}

	// Get the list of pods that use the volume on the other nodes.
	pods := rc.desiredStateOfWorld.GetVolumePodsOnNodes(otherNodes, volumeToAttach.VolumeName)
	if len(pods) == 0 {
		// We did not find any pods that request the volume. The pods must have been deleted already.
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", "Volume is already exclusively attached to one node and can't be attached to another")
		for _, pod := range volumeToAttach.ScheduledPods {
			rc.recorder.Eventf(pod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
		}
		// Log a detailed message for the system admin.
		logger.Info("Multi-Attach error: volume is already exclusively attached and can't be attached to another node", "attachedTo", otherNodesStr, "volume", volumeToAttach)
		return
	}

	// There are pods that require the volume and run on another node. Typically
	// it's user error, e.g. a ReplicaSet uses a PVC and has >1 replicas. Let
	// the user know what pods are blocking the volume.
	for _, scheduledPod := range volumeToAttach.ScheduledPods {
		// Each scheduledPod must get a custom message. The pods can run in
		// different namespaces, and a user of one namespace should not see the
		// names of pods in other namespaces.
		localPodNames := []string{} // Names of pods in scheduledPod's namespace
		otherPods := 0              // Count of pods in other namespaces
		for _, pod := range pods {
			if pod.Namespace == scheduledPod.Namespace {
				localPodNames = append(localPodNames, pod.Name)
			} else {
				otherPods++
			}
		}

		var msg string
		if len(localPodNames) > 0 {
			msg = fmt.Sprintf("Volume is already used by pod(s) %s", strings.Join(localPodNames, ", "))
			if otherPods > 0 {
				msg = fmt.Sprintf("%s and %d pod(s) in different namespaces", msg, otherPods)
			}
		} else {
			// No local pods; there are pods only in different namespaces.
			msg = fmt.Sprintf("Volume is already used by %d pod(s) in different namespaces", otherPods)
		}
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", msg)
		rc.recorder.Eventf(scheduledPod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
	}

	// Log all pods for the system admin.
	logger.Info("Multi-Attach error: volume is already used by pods", "pods", klog.KObjSlice(pods), "attachedTo", otherNodesStr, "volume", volumeToAttach)
}
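
// To get a rough feel for the per-pod message assembled above, here is a
// minimal standalone sketch; the pod names and counts are assumed example
// values, not taken from this package:
//
//	localPodNames := []string{"web-0", "web-1"} // pods in the scheduled pod's namespace
//	otherPods := 2                              // pods in other namespaces
//	msg := fmt.Sprintf("Volume is already used by pod(s) %s", strings.Join(localPodNames, ", "))
//	if otherPods > 0 {
//		msg = fmt.Sprintf("%s and %d pod(s) in different namespaces", msg, otherPods)
//	}
//	// msg == "Volume is already used by pod(s) web-0, web-1 and 2 pod(s) in different namespaces"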