k8s.io/kubernetes@v1.29.3/pkg/controller/volume/attachdetach/reconciler/reconciler.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package reconciler implements interfaces that attempt to reconcile the
// desired state of the world with the actual state of the world by triggering
// actions.
package reconciler

import (
	"context"
	"fmt"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics"
	"k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater"
	kevents "k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
	nodeutil "k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/pkg/util/taints"
	"k8s.io/kubernetes/pkg/volume/util"
	"k8s.io/kubernetes/pkg/volume/util/operationexecutor"
)

// Reconciler runs a periodic loop to reconcile the desired state of the world
// with the actual state of the world by triggering attach/detach operations.
// Note: This is distinct from the Reconciler implemented by the kubelet volume
// manager. This one reconciles state for the attach/detach controller; that
// one reconciles state for the kubelet volume manager.
type Reconciler interface {
	// Run starts the reconciliation loop, which executes periodically, checking
	// whether volumes that should be attached are attached and volumes that
	// should be detached are detached. If not, it triggers attach/detach
	// operations to rectify the difference.
	Run(ctx context.Context)
}

// NewReconciler returns a new instance of Reconciler that waits loopPeriod
// between successive executions.
// loopPeriod is the amount of time the reconciler loop waits between
// successive executions.
// maxWaitForUnmountDuration is the max amount of time the reconciler will wait
// for a volume to be safely unmounted; after this it will detach the volume
// anyway (to handle crashed/unavailable nodes). If the volume becomes used by
// a new pod during this time, the detach request is aborted and the timer
// cleared.
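//
// As a rough, hypothetical wiring sketch (not taken from this repository; the
// cache, executor, lister, and recorder variables below are assumed to be
// constructed elsewhere by the attach/detach controller, and the durations are
// only illustrative):
//
//	rc := NewReconciler(
//		100*time.Millisecond, // loopPeriod
//		6*time.Minute,        // maxWaitForUnmountDuration
//		time.Minute,          // syncDuration
//		false,                // disableReconciliationSync
//		desiredStateOfWorld, actualStateOfWorld,
//		attacherDetacher, nodeStatusUpdater, nodeLister, recorder)
//	go rc.Run(ctx)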
func NewReconciler(
	loopPeriod time.Duration,
	maxWaitForUnmountDuration time.Duration,
	syncDuration time.Duration,
	disableReconciliationSync bool,
	desiredStateOfWorld cache.DesiredStateOfWorld,
	actualStateOfWorld cache.ActualStateOfWorld,
	attacherDetacher operationexecutor.OperationExecutor,
	nodeStatusUpdater statusupdater.NodeStatusUpdater,
	nodeLister corelisters.NodeLister,
	recorder record.EventRecorder) Reconciler {
	return &reconciler{
		loopPeriod:                loopPeriod,
		maxWaitForUnmountDuration: maxWaitForUnmountDuration,
		syncDuration:              syncDuration,
		disableReconciliationSync: disableReconciliationSync,
		desiredStateOfWorld:       desiredStateOfWorld,
		actualStateOfWorld:        actualStateOfWorld,
		attacherDetacher:          attacherDetacher,
		nodeStatusUpdater:         nodeStatusUpdater,
		nodeLister:                nodeLister,
		timeOfLastSync:            time.Now(),
		recorder:                  recorder,
	}
}

type reconciler struct {
	loopPeriod                time.Duration
	maxWaitForUnmountDuration time.Duration
	syncDuration              time.Duration
	desiredStateOfWorld       cache.DesiredStateOfWorld
	actualStateOfWorld        cache.ActualStateOfWorld
	attacherDetacher          operationexecutor.OperationExecutor
	nodeStatusUpdater         statusupdater.NodeStatusUpdater
	nodeLister                corelisters.NodeLister
	timeOfLastSync            time.Time
	disableReconciliationSync bool
	recorder                  record.EventRecorder
}

func (rc *reconciler) Run(ctx context.Context) {
	wait.UntilWithContext(ctx, rc.reconciliationLoopFunc(ctx), rc.loopPeriod)
}

// reconciliationLoopFunc returns the function run on every loop iteration.
// The "volumes still attached" sync it triggers can be disabled via the CLI
// option disableReconciliation. It periodically checks whether the attached
// volumes from the actual state are still attached to the node and updates
// the status if they are not.
func (rc *reconciler) reconciliationLoopFunc(ctx context.Context) func(context.Context) {
	return func(ctx context.Context) {

		rc.reconcile(ctx)
		logger := klog.FromContext(ctx)
		if rc.disableReconciliationSync {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is disabled via the command line")
		} else if rc.syncDuration < time.Second {
			logger.V(5).Info("Skipping reconciling attached volumes still attached since it is set to less than one second via the command line")
		} else if time.Since(rc.timeOfLastSync) > rc.syncDuration {
			logger.V(5).Info("Starting reconciling attached volumes still attached")
			rc.sync()
		}
	}
}

func (rc *reconciler) sync() {
	defer rc.updateSyncTime()
	rc.syncStates()
}

func (rc *reconciler) updateSyncTime() {
	rc.timeOfLastSync = time.Now()
}

func (rc *reconciler) syncStates() {
	volumesPerNode := rc.actualStateOfWorld.GetAttachedVolumesPerNode()
	rc.attacherDetacher.VerifyVolumesAreAttached(volumesPerNode, rc.actualStateOfWorld)
}

// hasOutOfServiceTaint returns true if the node has the out-of-service taint present.
func (rc *reconciler) hasOutOfServiceTaint(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return taints.TaintKeyExists(node.Spec.Taints, v1.TaintNodeOutOfService), nil
}

// nodeIsHealthy returns true if the node looks healthy.
func (rc *reconciler) nodeIsHealthy(nodeName types.NodeName) (bool, error) {
	node, err := rc.nodeLister.Get(string(nodeName))
	if err != nil {
		return false, err
	}
	return nodeutil.IsNodeReady(node), nil
}

func (rc *reconciler) reconcile(ctx context.Context) {
	// Detaches are triggered before attaches so that volumes referenced by
	// pods that are rescheduled to a different node are detached first.

	// Ensure volumes that should be detached are detached.
	logger := klog.FromContext(ctx)
	for _, attachedVolume := range rc.actualStateOfWorld.GetAttachedVolumes() {
		if !rc.desiredStateOfWorld.VolumeExists(
			attachedVolume.VolumeName, attachedVolume.NodeName) {

			// Check whether there is already a pending operation, and don't even
			// try to start an operation if one is already running.
			// This check must be done before any other checks, because otherwise
			// the other checks may pass while the volume simultaneously leaves
			// the pending state, resulting in double detach attempts.
			// The operation key format differs depending on whether the volume
			// allows multi-attach across different nodes.
			if util.IsMultiAttachAllowed(attachedVolume.VolumeSpec) {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, attachedVolume.NodeName, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff for node. Can't start detach", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			} else {
				if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, "" /* nodeName */, operationexecutor.DetachOperationName) {
					logger.V(10).Info("Operation for volume is already running or still in exponential backoff in the cluster. Can't start detach for node", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
					continue
				}
			}

			// Because the detach operation updates the ActualStateOfWorld before
			// marking itself complete, it's possible for the volume to be removed
			// from the ActualStateOfWorld between the GetAttachedVolumes() check
			// and the IsOperationPending() check above.
			// Check the ActualStateOfWorld again to avoid issuing an unnecessary
			// detach.
			// See https://github.com/kubernetes/kubernetes/issues/93902
			attachState := rc.actualStateOfWorld.GetAttachState(attachedVolume.VolumeName, attachedVolume.NodeName)
			if attachState == cache.AttachStateDetached {
				logger.V(5).Info("Volume detached--skipping", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Set the detach request time.
			elapsedTime, err := rc.actualStateOfWorld.SetDetachRequestTime(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Cannot trigger detach because it fails to set detach request time with error")
				continue
			}
			// Check whether the elapsed time has reached the maximum waiting time.
			timeout := elapsedTime > rc.maxWaitForUnmountDuration

			isHealthy, err := rc.nodeIsHealthy(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get health of node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Force detach volumes from unhealthy nodes after maxWaitForUnmountDuration.
			forceDetach := !isHealthy && timeout

			hasOutOfServiceTaint, err := rc.hasOutOfServiceTaint(attachedVolume.NodeName)
			if err != nil {
				logger.Error(err, "Failed to get taint specs for node", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}

			// Check whether the volume is still mounted. Skip detach if it is still
			// mounted, unless the force-detach timeout has expired or the node has
			// the `node.kubernetes.io/out-of-service` taint.
			if attachedVolume.MountedByNode && !forceDetach && !hasOutOfServiceTaint {
				logger.V(5).Info("Cannot detach volume because it is still mounted", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				continue
			}

			// Before triggering volume detach, mark the volume as detached and update the node status.
			// If updating the node status fails, skip detaching the volume.
			// If the volume detach operation fails, the volume needs to be added back to report-as-attached
			// so that the node status has the correct volume attachment information.
			err = rc.actualStateOfWorld.RemoveVolumeFromReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
			if err != nil {
				logger.V(5).Info("RemoveVolumeFromReportAsAttached failed while removing volume from node",
					"node", klog.KRef("", string(attachedVolume.NodeName)),
					"volumeName", attachedVolume.VolumeName,
					"err", err)
			}

			// Update Node Status to indicate the volume is no longer safe to mount.
			err = rc.nodeStatusUpdater.UpdateNodeStatusForNode(logger, attachedVolume.NodeName)
			if err != nil {
				// Skip detaching this volume if unable to update node status.
				logger.Error(err, "UpdateNodeStatusForNode failed while attempting to report volume as attached", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				// Add the volume back to ReportAsAttached if the UpdateNodeStatusForNode call failed,
				// so that the node status updater will add it back to the VolumeAttached list.
				// It is needed here too because DetachVolume is not actually called and we want to
				// keep the data consistent on every reconcile.
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)
				continue
			}

			// Trigger volume detach, which requires the verify-safe-to-detach step.
			// If timeout is true, skip the verifySafeToDetach check.
			// If the node has the node.kubernetes.io/out-of-service taint with NoExecute effect,
			// skip the verifySafeToDetach check.
			logger.V(5).Info("Starting attacherDetacher.DetachVolume", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
			if hasOutOfServiceTaint {
				logger.V(4).Info("node has out-of-service taint", "node", klog.KRef("", string(attachedVolume.NodeName)))
			}
			verifySafeToDetach := !(timeout || hasOutOfServiceTaint)
			err = rc.attacherDetacher.DetachVolume(logger, attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld)
			if err == nil {
				if verifySafeToDetach { // normal detach
					logger.Info("attacherDetacher.DetachVolume started", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				} else { // force detach
					if timeout {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonTimeout)
						logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching",
							"duration", rc.maxWaitForUnmountDuration,
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					} else {
						metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonOutOfService)
						logger.Info("attacherDetacher.DetachVolume started: node has out-of-service taint, force detaching",
							"node", klog.KRef("", string(attachedVolume.NodeName)),
							"volumeName", attachedVolume.VolumeName)
					}
				}
			}
			if err != nil {
				// Add the volume back to ReportAsAttached if the DetachVolume call failed, so that
				// the node status updater will add it back to the VolumeAttached list.
				// This function is also called while executing the volume detach operation in operation_generator.
				// It is needed here too because the DetachVolume call might fail before executing the actual
				// operation in operation_executor (e.g., cannot find the volume plugin, etc.).
				rc.actualStateOfWorld.AddVolumeToReportAsAttached(logger, attachedVolume.VolumeName, attachedVolume.NodeName)

				if !exponentialbackoff.IsExponentialBackoff(err) {
					// Ignore exponentialbackoff.IsExponentialBackoff errors; they are expected.
					// Log all other errors.
					logger.Error(err, "attacherDetacher.DetachVolume failed to start", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
				}
			}
		}
	}

	rc.attachDesiredVolumes(logger)

	// Update Node Status
	err := rc.nodeStatusUpdater.UpdateNodeStatuses(logger)
	if err != nil {
		logger.Info("UpdateNodeStatuses failed", "err", err)
	}
}

func (rc *reconciler) attachDesiredVolumes(logger klog.Logger) {
	// Ensure volumes that should be attached are attached.
	for _, volumeToAttach := range rc.desiredStateOfWorld.GetVolumesToAttach() {
		if util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			// Don't even try to start an operation if there is already one running for the given volume and node.
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, volumeToAttach.NodeName) {
				logger.V(10).Info("Operation for volume is already running for node. Can't start attach", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeName", volumeToAttach.VolumeName)
				continue
			}
		} else {
			// Don't even try to start an operation if there is already one running for the given volume.
			if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "" /* podName */, "" /* nodeName */) {
				logger.V(10).Info("Operation for volume is already running. Can't start attach for node", "node", klog.KRef("", string(volumeToAttach.NodeName)), "volumeName", volumeToAttach.VolumeName)
				continue
			}
		}

		// Because the attach operation updates the ActualStateOfWorld before
		// marking itself complete, IsOperationPending() must be checked before
		// GetAttachState() to guarantee the ActualStateOfWorld is
		// up-to-date when it's read.
		// See https://github.com/kubernetes/kubernetes/issues/93902
		attachState := rc.actualStateOfWorld.GetAttachState(volumeToAttach.VolumeName, volumeToAttach.NodeName)
		if attachState == cache.AttachStateAttached {
			// Volume/Node exists, touch it to reset detachRequestedTime.
			logger.V(10).Info("Volume attached--touching", "volume", volumeToAttach)
			rc.actualStateOfWorld.ResetDetachRequestTime(logger, volumeToAttach.VolumeName, volumeToAttach.NodeName)
			continue
		}

		if !util.IsMultiAttachAllowed(volumeToAttach.VolumeSpec) {
			nodes := rc.actualStateOfWorld.GetNodesForAttachedVolume(volumeToAttach.VolumeName)
			if len(nodes) > 0 {
				if !volumeToAttach.MultiAttachErrorReported {
					rc.reportMultiAttachError(logger, volumeToAttach, nodes)
					rc.desiredStateOfWorld.SetMultiAttachError(volumeToAttach.VolumeName, volumeToAttach.NodeName)
				}
				continue
			}
		}

		// Volume/Node doesn't exist, spawn a goroutine to attach it.
		logger.V(5).Info("Starting attacherDetacher.AttachVolume", "volume", volumeToAttach)
		err := rc.attacherDetacher.AttachVolume(logger, volumeToAttach.VolumeToAttach, rc.actualStateOfWorld)
		if err == nil {
			logger.Info("attacherDetacher.AttachVolume started", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
		if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
			// Ignore exponentialbackoff.IsExponentialBackoff errors; they are expected.
			// Log all other errors.
			logger.Error(err, "attacherDetacher.AttachVolume failed to start", "volumeName", volumeToAttach.VolumeName, "nodeName", volumeToAttach.NodeName, "scheduledPods", klog.KObjSlice(volumeToAttach.ScheduledPods))
		}
	}
}

// reportMultiAttachError sends events and logs the situation where a volume
// that should be attached to a node is already attached to one or more
// different nodes.
func (rc *reconciler) reportMultiAttachError(logger klog.Logger, volumeToAttach cache.VolumeToAttach, nodes []types.NodeName) {
	// Filter out the current node from the list of nodes where the volume is
	// attached.
	// Some methods need []string and others need []NodeName, so collect both.
	// In theory these slices should always have a single element, because the
	// controller does not allow more than one attachment, but use slices just
	// in case...
	otherNodes := []types.NodeName{}
	otherNodesStr := []string{}
	for _, node := range nodes {
		if node != volumeToAttach.NodeName {
			otherNodes = append(otherNodes, node)
			otherNodesStr = append(otherNodesStr, string(node))
		}
	}

	// Get the list of pods that use the volume on the other nodes.
	pods := rc.desiredStateOfWorld.GetVolumePodsOnNodes(otherNodes, volumeToAttach.VolumeName)
	if len(pods) == 0 {
		// We did not find any pods that request the volume. The pods must have been deleted already.
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", "Volume is already exclusively attached to one node and can't be attached to another")
		for _, pod := range volumeToAttach.ScheduledPods {
			rc.recorder.Eventf(pod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
		}
		// Log a detailed message for the system admin.
		logger.Info("Multi-Attach error: volume is already exclusively attached and can't be attached to another node", "attachedTo", otherNodesStr, "volume", volumeToAttach)
		return
	}

	// There are pods that require the volume and run on another node. Typically
	// it's user error, e.g. a ReplicaSet uses a PVC and has more than one
	// replica. Let the user know which pods are blocking the volume.
	for _, scheduledPod := range volumeToAttach.ScheduledPods {
		// Each scheduledPod must get a custom message. They can run in
		// different namespaces, and a user of one namespace should not see the
		// names of pods in other namespaces.
		localPodNames := []string{} // Names of pods in scheduledPod's namespace
		otherPods := 0              // Count of pods in other namespaces
		for _, pod := range pods {
			if pod.Namespace == scheduledPod.Namespace {
				localPodNames = append(localPodNames, pod.Name)
			} else {
				otherPods++
			}
		}

		var msg string
		if len(localPodNames) > 0 {
			msg = fmt.Sprintf("Volume is already used by pod(s) %s", strings.Join(localPodNames, ", "))
			if otherPods > 0 {
				msg = fmt.Sprintf("%s and %d pod(s) in different namespaces", msg, otherPods)
			}
		} else {
			// No local pods; there are only pods in different namespaces.
			msg = fmt.Sprintf("Volume is already used by %d pod(s) in different namespaces", otherPods)
		}
		simpleMsg, _ := volumeToAttach.GenerateMsg("Multi-Attach error", msg)
		rc.recorder.Eventf(scheduledPod, v1.EventTypeWarning, kevents.FailedAttachVolume, simpleMsg)
	}

	// Log all pods for the system admin.
	logger.Info("Multi-Attach error: volume is already used by pods", "pods", klog.KObjSlice(pods), "attachedTo", otherNodesStr, "volume", volumeToAttach)
}
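
// shouldTriggerDetach is not part of the upstream file; it is a minimal
// illustrative sketch that condenses the mounted-volume gate in reconcile()
// above, assuming the same four inputs: whether the volume is still mounted
// by the node, whether maxWaitForUnmountDuration has elapsed since the detach
// request, whether the node reports Ready, and whether it carries the
// out-of-service taint. reconcile() skips the detach only when the volume is
// still mounted and neither force condition holds.
func shouldTriggerDetach(mountedByNode, unmountTimeoutExpired, nodeIsHealthy, hasOutOfServiceTaint bool) bool {
	// Force detach only applies to unhealthy nodes once the unmount wait expires.
	forceDetach := !nodeIsHealthy && unmountTimeoutExpired
	// Detach proceeds when the volume is no longer mounted, or when a force
	// condition (unhealthy-node timeout, out-of-service taint) applies.
	return !mountedByNode || forceDetach || hasOutOfServiceTaint
}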