k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/pod_workers.go

/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/queue"
	"k8s.io/utils/clock"
)

// OnCompleteFunc is a function that is invoked when an operation completes.
// If err is non-nil, the operation did not complete successfully.
type OnCompleteFunc func(err error)

// PodStatusFunc is a function that is invoked to override the pod status when a pod is killed.
type PodStatusFunc func(podStatus *v1.PodStatus)

// KillPodOptions are options when performing a pod update whose update type is kill.
type KillPodOptions struct {
	// CompletedCh is closed when the kill request completes (syncTerminatingPod has completed
	// without error) or if the pod does not exist, or if the pod has already terminated. This
	// could take an arbitrary amount of time to be closed, but is never left open once
	// CouldHaveRunningContainers() returns false.
	CompletedCh chan<- struct{}
	// Evict is true if this is a pod-triggered eviction - once a pod is evicted some resources are
	// more aggressively reaped than during normal pod operation (stopped containers).
	Evict bool
	// PodStatusFunc is invoked (if set) and overrides the status of the pod at the time the pod is killed.
	// The provided status is populated from the latest state.
	PodStatusFunc PodStatusFunc
	// PodTerminationGracePeriodSecondsOverride is an optional override to use if a pod is being killed as part of a kill operation.
	PodTerminationGracePeriodSecondsOverride *int64
}

// UpdatePodOptions is an options struct to pass to a UpdatePod operation.
type UpdatePodOptions struct {
	// The type of update (create, update, sync, kill).
	UpdateType kubetypes.SyncPodType
	// StartTime is an optional timestamp for when this update was created. If set,
	// when this update is fully realized by the pod worker it will be recorded in
	// the PodWorkerDuration metric.
	StartTime time.Time
	// Pod to update. Required.
	Pod *v1.Pod
	// MirrorPod is the mirror pod if Pod is a static pod. Optional when UpdateType
	// is kill or terminated.
	MirrorPod *v1.Pod
	// RunningPod is a runtime pod that is no longer present in config. Required
	// if Pod is nil, ignored if Pod is set.
	RunningPod *kubecontainer.Pod
	// KillPodOptions is used to override the default termination behavior of the
	// pod or to update the pod status after an operation is completed. Since a
	// pod can be killed for multiple reasons, PodStatusFunc is invoked in order
	// and later kills have an opportunity to override the status (i.e. a preemption
	// may be later turned into an eviction).
	KillPodOptions *KillPodOptions
}

// PodWorkerState classifies the status of a pod as seen by the pod worker - setup (sync),
// teardown of containers (terminating), or cleanup (terminated).
type PodWorkerState int

const (
	// SyncPod is when the pod is expected to be started and running.
	SyncPod PodWorkerState = iota
	// TerminatingPod is when the pod is no longer being set up, but some
	// containers may be running and are being torn down.
	TerminatingPod
	// TerminatedPod indicates the pod is stopped, can have no more running
	// containers, and any foreground cleanup can be executed.
	TerminatedPod
)

func (state PodWorkerState) String() string {
	switch state {
	case SyncPod:
		return "sync"
	case TerminatingPod:
		return "terminating"
	case TerminatedPod:
		return "terminated"
	default:
		panic(fmt.Sprintf("the state %d is not defined", state))
	}
}

// PodWorkerSync is the summarization of a single pod worker for sync. Values
// besides state are used to provide metric counts for operators.
type PodWorkerSync struct {
	// State of the pod.
	State PodWorkerState
	// Orphan is true if the pod is no longer in the desired set passed to SyncKnownPods.
	Orphan bool
	// HasConfig is true if we have a historical pod spec for this pod.
	HasConfig bool
	// Static is true if we have config and the pod came from a static source.
	Static bool
}

// podWork is the internal changes to be processed by a pod worker.
type podWork struct {
	// WorkType is the type of sync to perform - sync (create), terminating (stop
	// containers), terminated (clean up and write status).
	WorkType PodWorkerState

	// Options contains the data to sync.
	Options UpdatePodOptions
}

// PodWorkers is an abstract interface for testability.
type PodWorkers interface {
	// UpdatePod notifies the pod worker of a change to a pod, which will then
	// be processed in FIFO order by a goroutine per pod UID. The state of the
	// pod will be passed to the syncPod method until either the pod is marked
	// as deleted, it reaches a terminal phase (Succeeded/Failed), or the pod
	// is evicted by the kubelet. Once that occurs the syncTerminatingPod method
	// will be called until it exits successfully, and after that all further
	// UpdatePod() calls will be ignored for that pod until it has been forgotten
	// due to significant time passing. A pod that is terminated will never be
	// restarted.
	UpdatePod(options UpdatePodOptions)
	// SyncKnownPods removes workers for pods that are not in the desiredPods set
	// and have been terminated for a significant period of time. Once this method
	// has been called once, the workers are assumed to be fully initialized and
	// subsequent calls to ShouldPodContentBeRemoved on unknown pods will return
	// true. It returns a map describing the state of each known pod worker. It
	// is the responsibility of the caller to re-add any desired pods that are not
	// returned as knownPods.
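	//
	// A rough sketch of the housekeeping pattern this supports (kl and desiredPods are
	// illustrative names, not part of this interface):
	//
	//	known := kl.podWorkers.SyncKnownPods(desiredPods)
	//	for uid, sync := range known {
	//		if sync.Orphan && sync.State != TerminatedPod {
	//			// no longer desired but not yet finished - the caller may need to
	//			// deliver another kill update for this UID
	//			_ = uid
	//		}
	//	}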
	SyncKnownPods(desiredPods []*v1.Pod) (knownPods map[types.UID]PodWorkerSync)

	// IsPodKnownTerminated returns true once SyncTerminatingPod completes
	// successfully - the provided pod UID is known by the pod
	// worker to be terminated. If the pod has been force deleted and the pod worker
	// has completed termination this method will return false, so this method should
	// only be used to filter out pods from the desired set such as in admission.
	//
	// Intended for use by the kubelet config loops, but not subsystems, which should
	// use ShouldPod*().
	IsPodKnownTerminated(uid types.UID) bool
	// CouldHaveRunningContainers returns true before the pod workers have synced,
	// once the pod workers see the pod (syncPod could be called), and returns false
	// after the pod has been terminated (running containers guaranteed stopped).
	//
	// Intended for use by the kubelet config loops, but not subsystems, which should
	// use ShouldPod*().
	CouldHaveRunningContainers(uid types.UID) bool

	// ShouldPodBeFinished returns true once SyncTerminatedPod completes
	// successfully - the provided pod UID is known to the pod worker to
	// be terminated and have resources reclaimed. It returns false before the
	// pod workers have synced (syncPod could be called). Once the pod workers
	// have synced it returns false if the pod has a sync status until
	// SyncTerminatedPod completes successfully. If the pod workers have synced,
	// but the pod does not have a status it returns true.
	//
	// Intended for use by subsystem sync loops to avoid performing background setup
	// after termination has been requested for a pod. Callers must ensure that the
	// syncPod method is non-blocking when their data is absent.
	ShouldPodBeFinished(uid types.UID) bool
	// IsPodTerminationRequested returns true when pod termination has been requested
	// until the termination completes and the pod is removed from config. This should
	// not be used in cleanup loops because it will return false if the pod has already
	// been cleaned up - use ShouldPodContainersBeTerminating instead. Also, this method
	// may return true while containers are still being initialized by the pod worker.
	//
	// Intended for use by the kubelet sync* methods, but not subsystems, which should
	// use ShouldPod*().
	IsPodTerminationRequested(uid types.UID) bool

	// ShouldPodContainersBeTerminating returns false before pod workers have synced,
	// and returns true once a pod has started terminating. This check is similar to
	// ShouldPodRuntimeBeRemoved but is also true after pod termination is requested.
	//
	// Intended for use by subsystem sync loops to avoid performing background setup
	// after termination has been requested for a pod. Callers must ensure that the
	// syncPod method is non-blocking when their data is absent.
	ShouldPodContainersBeTerminating(uid types.UID) bool
	// ShouldPodRuntimeBeRemoved returns true if runtime managers within the Kubelet
	// should aggressively cleanup pod resources that are not containers or on disk
	// content, like attached volumes. This is true when a pod is not yet observed
	// by a worker after the first sync (meaning it can't be running yet) or after
	// all running containers are stopped.
	// TODO: Once pod logs are separated from running containers, this method should
	// be used to gate whether containers are kept.
	//
	// Intended for use by subsystem sync loops to know when to start tearing down
	// resources that are used by running containers. Callers should ensure that
	// runtime content they own is not required for post-termination - for instance
	// containers are required in docker to preserve pod logs until after the pod
	// is deleted.
	ShouldPodRuntimeBeRemoved(uid types.UID) bool
	// ShouldPodContentBeRemoved returns true if resource managers within the Kubelet
	// should aggressively cleanup all content related to the pod. This is true
	// during pod eviction (when we wish to remove that content to free resources)
	// as well as after the request to delete a pod has resulted in containers being
	// stopped (which is a more graceful action). Note that a deleting pod can still
	// be evicted.
	//
	// Intended for use by subsystem sync loops to know when to start tearing down
	// resources that are used by non-deleted pods. Content is generally preserved
	// until deletion+removal_from_etcd or eviction, although garbage collection
	// can free content when this method returns false.
	ShouldPodContentBeRemoved(uid types.UID) bool
	// IsPodForMirrorPodTerminatingByFullName returns true if a static pod with the
	// provided pod name is currently terminating and has yet to complete. It is
	// intended to be used only during orphan mirror pod cleanup to prevent us from
	// deleting a terminating static pod from the apiserver before the pod is shut
	// down.
	IsPodForMirrorPodTerminatingByFullName(podFullname string) bool
}

// podSyncer describes the core lifecycle operations of the pod state machine. A pod is first
// synced until it naturally reaches termination (true is returned) or an external agent decides
// the pod should be terminated. Once a pod should be terminating, SyncTerminatingPod is invoked
// until it returns no error. Then the SyncTerminatedPod method is invoked until it exits without
// error, and the pod is considered terminal. Implementations of this interface must be threadsafe
// for simultaneous invocation of these methods for multiple pods.
type podSyncer interface {
	// SyncPod configures the pod and starts and restarts all containers. If it returns true, the
	// pod has reached a terminal state and the presence of the error indicates succeeded or failed.
	// If an error is returned, the sync was not successful and should be rerun in the future. This
	// is a long running method and should exit early with context.Canceled if the context is canceled.
	SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error)
	// SyncTerminatingPod attempts to ensure the pod's containers are no longer running and to collect
	// any final status. This method is repeatedly invoked with diminishing grace periods until it exits
	// without error. Once this method exits with no error other components are allowed to tear down
	// supporting resources like volumes and devices. If the context is canceled, the method should
	// return context.Canceled unless it has successfully finished, which may occur when a shorter
	// grace period is detected.
	SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error
	// SyncTerminatingRuntimePod is invoked when running containers are found that correspond to
	// a pod that is no longer known to the kubelet, in order to terminate those containers. It
	// should not exit without error unless all containers are known to be stopped.
	SyncTerminatingRuntimePod(ctx context.Context, runningPod *kubecontainer.Pod) error
	// SyncTerminatedPod is invoked after all running containers are stopped and is responsible
	// for any cleanup that should be executed right away rather than in the background.
	// Once it exits without error the pod is considered finished on the node.
	SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error
}

type syncPodFnType func(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error)
type syncTerminatingPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error
type syncTerminatingRuntimePodFnType func(ctx context.Context, runningPod *kubecontainer.Pod) error
type syncTerminatedPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error

// podSyncerFuncs implements podSyncer and accepts functions for each method.
type podSyncerFuncs struct {
	syncPod                   syncPodFnType
	syncTerminatingPod        syncTerminatingPodFnType
	syncTerminatingRuntimePod syncTerminatingRuntimePodFnType
	syncTerminatedPod         syncTerminatedPodFnType
}

func newPodSyncerFuncs(s podSyncer) podSyncerFuncs {
	return podSyncerFuncs{
		syncPod:                   s.SyncPod,
		syncTerminatingPod:        s.SyncTerminatingPod,
		syncTerminatingRuntimePod: s.SyncTerminatingRuntimePod,
		syncTerminatedPod:         s.SyncTerminatedPod,
	}
}

var _ podSyncer = podSyncerFuncs{}

func (f podSyncerFuncs) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) {
	return f.syncPod(ctx, updateType, pod, mirrorPod, podStatus)
}
func (f podSyncerFuncs) SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error {
	return f.syncTerminatingPod(ctx, pod, podStatus, gracePeriod, podStatusFn)
}
func (f podSyncerFuncs) SyncTerminatingRuntimePod(ctx context.Context, runningPod *kubecontainer.Pod) error {
	return f.syncTerminatingRuntimePod(ctx, runningPod)
}
func (f podSyncerFuncs) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error {
	return f.syncTerminatedPod(ctx, pod, podStatus)
}

const (
	// jitter factor for resyncInterval
	workerResyncIntervalJitterFactor = 0.5

	// jitter factor for backOffPeriod and backOffOnTransientErrorPeriod
	workerBackOffPeriodJitterFactor = 0.5

	// backoff period when a transient error occurs.
	backOffOnTransientErrorPeriod = time.Second
)

// podSyncStatus tracks per-pod transitions through the three phases of pod
// worker sync (setup, terminating, terminated).
type podSyncStatus struct {
	// ctx is the context that is associated with the current pod sync.
	// TODO: remove this from the struct by having the context initialized
	// in startPodSync, the cancelFn used by UpdatePod, and cancellation of
	// a parent context for tearing down workers (if needed) on shutdown
	ctx context.Context
	// cancelFn if set is expected to cancel the current podSyncer operation.
	cancelFn context.CancelFunc

	// fullname of the pod
	fullname string

	// working is true if an update is pending or being worked by a pod worker
	// goroutine.
	working bool
	// pendingUpdate is the updated state the pod worker should observe. It is
	// cleared and moved to activeUpdate when a pod worker reads it. A new update
	// may always replace a pending update as the pod worker does not guarantee
	// that all intermediate states are synced to a worker, only the most recent.
	// This state will not be visible to downstream components until a pod worker
	// has begun processing it.
	pendingUpdate *UpdatePodOptions
	// activeUpdate is the most recent version of the pod's state that will be
	// passed to a sync*Pod function. A pod becomes visible to downstream components
	// once a worker decides to start a pod (startedAt is set). The pod and mirror
	// pod fields are accumulated if they are missing on a particular call (the last
	// known version), and the value of KillPodOptions is accumulated as pods cannot
	// have their grace period lengthened. This is the source of truth for the pod spec
	// the kubelet is reconciling towards for all components that act on running pods.
	activeUpdate *UpdatePodOptions

	// syncedAt is the time at which the pod worker first observed this pod.
	syncedAt time.Time
	// startedAt is the time at which the pod worker allowed the pod to start.
	startedAt time.Time
	// terminatingAt is set once the pod is requested to be killed - note that
	// this can be set before the pod worker starts terminating the pod, see
	// startedTerminating.
	terminatingAt time.Time
	// terminatedAt is set once the pod worker has completed a successful
	// syncTerminatingPod call and means all running containers are stopped.
	terminatedAt time.Time
	// gracePeriod is the requested gracePeriod once terminatingAt is nonzero.
	gracePeriod int64
	// notifyPostTerminating will be closed once the pod transitions to
	// terminated. After the pod is in terminated state, nothing should be
	// added to this list.
	notifyPostTerminating []chan<- struct{}
	// statusPostTerminating is a list of the status changes associated
	// with kill pod requests. After the pod is in terminated state, nothing
	// should be added to this list. The worker will execute the last function
	// in this list on each termination attempt.
	statusPostTerminating []PodStatusFunc

	// startedTerminating is true once the pod worker has observed the request to
	// stop a pod (exited syncPod and observed a podWork with WorkType
	// TerminatingPod). Once this is set, it is safe for other components
	// of the kubelet to assume that no other containers may be started.
	startedTerminating bool
	// deleted is true if the pod has been marked for deletion on the apiserver
	// or has no configuration represented (was deleted before).
	deleted bool
	// evicted is true if the kill indicated this was an eviction (an evicted
	// pod can be more aggressively cleaned up).
	evicted bool
	// finished is true once the pod worker completes for a pod
	// (syncTerminatedPod exited with no errors) until SyncKnownPods is invoked
	// to remove the pod. A terminal pod (Succeeded/Failed) will have
	// termination status until the pod is deleted.
	finished bool
	// restartRequested is true if the pod worker was informed the pod is
	// expected to exist (update type of create, update, or sync) after
	// it has been killed. When known pods are synced, any pod that is
	// terminated and has restartRequested will have its history cleared.
	restartRequested bool
	// observedRuntime is true if the pod has been observed to be present in the
	// runtime. A pod that has been observed at runtime must go through either
	// SyncTerminatingRuntimePod or SyncTerminatingPod. Otherwise, we can avoid
	// invoking the terminating methods if the pod is deleted or orphaned before
	// it has been started.
	observedRuntime bool
}

func (s *podSyncStatus) IsWorking() bool              { return s.working }
func (s *podSyncStatus) IsTerminationRequested() bool { return !s.terminatingAt.IsZero() }
func (s *podSyncStatus) IsTerminationStarted() bool   { return s.startedTerminating }
func (s *podSyncStatus) IsTerminated() bool           { return !s.terminatedAt.IsZero() }
func (s *podSyncStatus) IsFinished() bool             { return s.finished }
func (s *podSyncStatus) IsEvicted() bool              { return s.evicted }
func (s *podSyncStatus) IsDeleted() bool              { return s.deleted }
func (s *podSyncStatus) IsStarted() bool              { return !s.startedAt.IsZero() }

// WorkType returns the current state of the pod in the pod lifecycle state machine.
func (s *podSyncStatus) WorkType() PodWorkerState {
	if s.IsTerminated() {
		return TerminatedPod
	}
	if s.IsTerminationRequested() {
		return TerminatingPod
	}
	return SyncPod
}

// mergeLastUpdate records the most recent state from a new update. Pod and MirrorPod are
// updated to the most recent values. KillPodOptions is accumulated. If RunningPod is set,
// Pod is synthetic and will *not* be used as the last pod state unless no previous pod state
// exists (because the pod worker may be responsible for terminating a pod from a previous run
// of the kubelet where no config state is visible). The contents of activeUpdate are used as
// the source of truth for components downstream of the pod workers.
func (s *podSyncStatus) mergeLastUpdate(other UpdatePodOptions) {
	opts := s.activeUpdate
	if opts == nil {
		opts = &UpdatePodOptions{}
		s.activeUpdate = opts
	}

	// UpdatePodOptions states (and UpdatePod enforces) that either Pod or RunningPod
	// is set, and we wish to preserve the most recent Pod we have observed, so only
	// overwrite our Pod when we have no Pod or when RunningPod is nil.
	if opts.Pod == nil || other.RunningPod == nil {
		opts.Pod = other.Pod
	}
	// running pods will not persist but will be remembered for replay
	opts.RunningPod = other.RunningPod
	// if mirrorPod was not provided, remember the last one for replay
	if other.MirrorPod != nil {
		opts.MirrorPod = other.MirrorPod
	}
	// accumulate kill pod options
	if other.KillPodOptions != nil {
		opts.KillPodOptions = &KillPodOptions{}
		if other.KillPodOptions.Evict {
			opts.KillPodOptions.Evict = true
		}
		if override := other.KillPodOptions.PodTerminationGracePeriodSecondsOverride; override != nil {
			value := *override
			opts.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &value
		}
	}
	// StartTime is not copied - that is purely for tracking latency of config propagation
	// from kubelet to pod worker.
}

// podWorkers keeps track of operations on pods and ensures each pod is
// reconciled with the container runtime and other subsystems. The worker
// also tracks which pods are in flight for starting, which pods are
// shutting down but still have running containers, and which pods have
// terminated recently and are guaranteed to have no running containers.
//
// podWorkers is the source of truth for what pods should be active on a
// node at any time, and is kept up to date with the desired state of the
// node (tracked by the kubelet pod config loops and the state in the
// kubelet's podManager) via the UpdatePod method. Components that act
// upon running pods should look to the pod worker for state instead of the
// kubelet podManager. The pod worker is periodically reconciled with the
// state of the podManager via SyncKnownPods() and is responsible for
// ensuring the completion of all observed pods no longer present in
// the podManager (no longer part of the node's desired config).
//
// A pod passed to a pod worker is either being synced (expected to be
// running), terminating (has running containers but no new containers are
// expected to start), terminated (has no running containers but may still
// have resources being consumed), or cleaned up (no resources remaining).
// Once a pod is set to be "torn down" it cannot be started again for that
// UID (corresponding to a delete or eviction) until:
//
//  1. The pod worker is finalized (syncTerminatingPod and
//     syncTerminatedPod exit without error sequentially)
//  2. The SyncKnownPods method is invoked by kubelet housekeeping and the pod
//     is not part of the known config.
//
// Pod workers provide a consistent source of information to other kubelet
// loops about the status of the pod and whether containers can be
// running. The ShouldPodContentBeRemoved() method tracks whether a pod's
// contents should still exist, which includes non-existent pods after
// SyncKnownPods() has been called once (as per the contract, all existing
// pods should be provided via UpdatePod before SyncKnownPods is invoked).
// Generally other sync loops are expected to separate "setup" and
// "teardown" responsibilities and the information methods here assist in
// each by centralizing that state.
// A simple visualization of the time intervals involved might look like:
//
// ---| = kubelet config has synced at least once
// -------| |- = pod exists in apiserver config
// --------| |---------------- = CouldHaveRunningContainers() is true
//
// ^- pod is observed by pod worker .
// . .
//
// ----------| |------------------------- = syncPod is running
//
// . ^- pod worker loop sees change and invokes syncPod
// . . .
//
// --------------| |------- = ShouldPodContainersBeTerminating() returns true
// --------------| |------- = IsPodTerminationRequested() returns true (pod is known)
//
// . . ^- Kubelet evicts pod .
// . . .
//
// -------------------| |---------------- = syncTerminatingPod runs then exits without error
//
// . . ^ pod worker loop exits syncPod, sees pod is terminating,
// . . invokes syncTerminatingPod
// . . .
//
// ---| |------------------| . = ShouldPodRuntimeBeRemoved() returns true (post-sync)
//
// . ^ syncTerminatingPod has exited successfully
// . .
//
// ----------------------------| |------- = syncTerminatedPod runs then exits without error
//
// . ^ other loops can tear down
// . .
//
// ------------------------------------| |---- = status manager is waiting for SyncTerminatedPod() to finish
//
// . ^ .
//
// ----------| |- = status manager can be writing pod status
//
// ^ status manager deletes pod because no longer exists in config
//
// Other components in the Kubelet can request a termination of the pod
// via the UpdatePod method or the killPodNow wrapper - this will ensure
// the components of the pod are stopped until the kubelet is restarted
// or permanently (if the phase of the pod is set to a terminal phase
// in the pod status change).
type podWorkers struct {
	// Protects all per-worker fields.
	podLock sync.Mutex
	// podsSynced is true once the pod worker has been synced at least once,
	// which means that all working pods have been started via UpdatePod().
	podsSynced bool

	// Tracks all running per-pod goroutines - each per-pod goroutine processes
	// updates received through its corresponding channel. Sending
	// a message on this channel will signal the corresponding goroutine to
	// consume podSyncStatuses[uid].pendingUpdate if set.
	podUpdates map[types.UID]chan struct{}
	// Tracks by UID the termination status of a pod - syncing, terminating,
	// terminated, and evicted.
	podSyncStatuses map[types.UID]*podSyncStatus

	// Tracks all uids for started static pods by full name
	startedStaticPodsByFullname map[string]types.UID
	// Tracks all uids for static pods that are waiting to start by full name
	waitingToStartStaticPodsByFullname map[string][]types.UID

	workQueue queue.WorkQueue

	// This function is run to sync the desired state of the pod.
	// NOTE: This function has to be thread-safe - it can be called for
	// different pods at the same time.
	podSyncer podSyncer

	// workerChannelFn is exposed for testing to allow unit tests to impose delays
	// in channel communication. The function is invoked once each time a new worker
	// goroutine starts.
	workerChannelFn func(uid types.UID, in chan struct{}) (out <-chan struct{})

	// The EventRecorder to use
	recorder record.EventRecorder

	// backOffPeriod is the duration to back off when there is a sync error.
	backOffPeriod time.Duration

	// resyncInterval is the duration to wait until the next sync.
	resyncInterval time.Duration

	// podCache stores kubecontainer.PodStatus for all pods.
	podCache kubecontainer.Cache

	// clock is used for testing timing
	clock clock.PassiveClock
}

func newPodWorkers(
	podSyncer podSyncer,
	recorder record.EventRecorder,
	workQueue queue.WorkQueue,
	resyncInterval, backOffPeriod time.Duration,
	podCache kubecontainer.Cache,
) PodWorkers {
	return &podWorkers{
		podSyncStatuses:                    map[types.UID]*podSyncStatus{},
		podUpdates:                         map[types.UID]chan struct{}{},
		startedStaticPodsByFullname:        map[string]types.UID{},
		waitingToStartStaticPodsByFullname: map[string][]types.UID{},
		podSyncer:                          podSyncer,
		recorder:                           recorder,
		workQueue:                          workQueue,
		resyncInterval:                     resyncInterval,
		backOffPeriod:                      backOffPeriod,
		podCache:                           podCache,
		clock:                              clock.RealClock{},
	}
}

func (p *podWorkers) IsPodKnownTerminated(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsTerminated()
	}
	// if the pod is not known, we return false (pod worker is not aware of it)
	return false
}

func (p *podWorkers) CouldHaveRunningContainers(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return !status.IsTerminated()
	}
	// once all pods are synced, any pod without sync status is known to not be running.
	return !p.podsSynced
}

func (p *podWorkers) ShouldPodBeFinished(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsFinished()
	}
	// once all pods are synced, any pod without sync status is assumed to
	// have SyncTerminatedPod finished.
	return p.podsSynced
}

func (p *podWorkers) IsPodTerminationRequested(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		// the pod may still be setting up at this point.
		return status.IsTerminationRequested()
	}
	// an unknown pod is considered not to be terminating (use ShouldPodContainersBeTerminating in
	// cleanup loops to avoid failing to cleanup pods that have already been removed from config)
	return false
}

func (p *podWorkers) ShouldPodContainersBeTerminating(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		// we wait until the pod worker goroutine observes the termination, which means syncPod will not
		// be executed again, which means no new containers can be started
		return status.IsTerminationStarted()
	}
	// once we've synced, if the pod isn't known to the workers we should be tearing them
	// down
	return p.podsSynced
}

func (p *podWorkers) ShouldPodRuntimeBeRemoved(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsTerminated()
	}
	// a pod that hasn't been sent to the pod worker yet should have no runtime components once we have
	// synced all content.
	return p.podsSynced
}

func (p *podWorkers) ShouldPodContentBeRemoved(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsEvicted() || (status.IsDeleted() && status.IsTerminated())
	}
	// a pod that hasn't been sent to the pod worker yet should have no content on disk once we have
	// synced all content.
	return p.podsSynced
}

func (p *podWorkers) IsPodForMirrorPodTerminatingByFullName(podFullName string) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	uid, started := p.startedStaticPodsByFullname[podFullName]
	if !started {
		return false
	}
	status, exists := p.podSyncStatuses[uid]
	if !exists {
		return false
	}
	if !status.IsTerminationRequested() || status.IsTerminated() {
		return false
	}

	return true
}

func isPodStatusCacheTerminal(status *kubecontainer.PodStatus) bool {
	for _, container := range status.ContainerStatuses {
		if container.State == kubecontainer.ContainerStateRunning {
			return false
		}
	}
	for _, sb := range status.SandboxStatuses {
		if sb.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return false
		}
	}
	return true
}

// UpdatePod carries a configuration change or termination state to a pod. A pod is either runnable,
// terminating, or terminated, and will transition to terminating if: deleted on the apiserver,
// discovered to have a terminal phase (Succeeded or Failed), or evicted by the kubelet.
func (p *podWorkers) UpdatePod(options UpdatePodOptions) {
	// Handle when the pod is an orphan (no config) and we only have runtime status by running only
	// the terminating part of the lifecycle.
	// A running pod contains only a minimal set of information about the pod.
	var isRuntimePod bool
	var uid types.UID
	var name, ns string
	if runningPod := options.RunningPod; runningPod != nil {
		if options.Pod == nil {
			// the synthetic pod created here is used only as a placeholder and not tracked
			if options.UpdateType != kubetypes.SyncPodKill {
				klog.InfoS("Pod update is ignored, runtime pods can only be killed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID, "updateType", options.UpdateType)
				return
			}
			uid, ns, name = runningPod.ID, runningPod.Namespace, runningPod.Name
			isRuntimePod = true
		} else {
			options.RunningPod = nil
			uid, ns, name = options.Pod.UID, options.Pod.Namespace, options.Pod.Name
			klog.InfoS("Pod update included RunningPod which is only valid when Pod is not specified", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
		}
	} else {
		uid, ns, name = options.Pod.UID, options.Pod.Namespace, options.Pod.Name
	}

	p.podLock.Lock()
	defer p.podLock.Unlock()

	// decide what to do with this pod - we are either setting it up, tearing it down, or ignoring it
	var firstTime bool
	now := p.clock.Now()
	status, ok := p.podSyncStatuses[uid]
	if !ok {
		klog.V(4).InfoS("Pod is being synced for the first time", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
		firstTime = true
		status = &podSyncStatus{
			syncedAt: now,
			fullname: kubecontainer.BuildPodFullName(name, ns),
		}
		// if this pod is being synced for the first time, we need to make sure it is an active pod
		if options.Pod != nil && (options.Pod.Status.Phase == v1.PodFailed || options.Pod.Status.Phase == v1.PodSucceeded) {
			// Check to see if the pod is not running and the pod is terminal; if this succeeds then record in the podWorker that it is terminated.
			// This is needed because after a kubelet restart, we need to ensure terminal pods will NOT be considered active in Pod Admission. See http://issues.k8s.io/105523
			// However, `filterOutInactivePods` considers pods that are actively terminating as active. As a result, `IsPodKnownTerminated()` needs to return true and thus `terminatedAt` needs to be set.
			if statusCache, err := p.podCache.Get(uid); err == nil {
				if isPodStatusCacheTerminal(statusCache) {
					// At this point we know:
					// (1) The pod is terminal based on the config source.
					// (2) The pod is terminal based on the runtime cache.
					// This implies that this pod had already completed `SyncTerminatingPod` sometime in the past. The pod is likely being synced for the first time due to a kubelet restart.
					// These pods need to complete SyncTerminatedPod to ensure that all resources are cleaned and that the status manager makes the final status updates for the pod.
					// As a result, set finished: false, to ensure a Terminated event will be sent and `SyncTerminatedPod` will run.
					status = &podSyncStatus{
						terminatedAt:       now,
						terminatingAt:      now,
						syncedAt:           now,
						startedTerminating: true,
						finished:           false,
						fullname:           kubecontainer.BuildPodFullName(name, ns),
					}
				}
			}
		}
		p.podSyncStatuses[uid] = status
	}

	// RunningPods represent an unknown pod execution and don't contain pod spec information
	// sufficient to perform any action other than termination.
	// If we received a RunningPod after a real pod has already been provided, use the most recent
	// spec instead. Also, once we observe a runtime pod we must drive it to completion, even if we
	// weren't the ones who started it.
	pod := options.Pod
	if isRuntimePod {
		status.observedRuntime = true
		switch {
		case status.pendingUpdate != nil && status.pendingUpdate.Pod != nil:
			pod = status.pendingUpdate.Pod
			options.Pod = pod
			options.RunningPod = nil
		case status.activeUpdate != nil && status.activeUpdate.Pod != nil:
			pod = status.activeUpdate.Pod
			options.Pod = pod
			options.RunningPod = nil
		default:
			// we will continue to use RunningPod.ToAPIPod() as pod here, but
			// options.Pod will be nil and other methods must handle that appropriately.
			pod = options.RunningPod.ToAPIPod()
		}
	}

	// When we see a create update on an already terminating pod, that implies two pods with the same UID were created in
	// close temporal proximity (usually static pod but it's possible for an apiserver to extremely rarely do something
	// similar) - flag the sync status to indicate that after the pod terminates it should be reset to "not running" to
	// allow a subsequent add/update to start the pod worker again. This does not apply to the first time we see a pod,
	// such as when the kubelet restarts and we see already terminated pods for the first time.
	if !firstTime && status.IsTerminationRequested() {
		if options.UpdateType == kubetypes.SyncPodCreate {
			status.restartRequested = true
			klog.V(4).InfoS("Pod is terminating but has been requested to restart with same UID, will be reconciled later", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			return
		}
	}

	// once a pod is terminated by UID, it cannot reenter the pod worker (until the UID is purged by housekeeping)
	if status.IsFinished() {
		klog.V(4).InfoS("Pod is finished processing, no further updates", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
		return
	}

	// check for a transition to terminating
	var becameTerminating bool
	if !status.IsTerminationRequested() {
		switch {
		case isRuntimePod:
			klog.V(4).InfoS("Pod is orphaned and must be torn down", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			status.deleted = true
			status.terminatingAt = now
			becameTerminating = true
		case pod.DeletionTimestamp != nil:
			klog.V(4).InfoS("Pod is marked for graceful deletion, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			status.deleted = true
			status.terminatingAt = now
			becameTerminating = true
		case pod.Status.Phase == v1.PodFailed, pod.Status.Phase == v1.PodSucceeded:
			klog.V(4).InfoS("Pod is in a terminal phase (success/failed), begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			status.terminatingAt = now
			becameTerminating = true
		case options.UpdateType == kubetypes.SyncPodKill:
			if options.KillPodOptions != nil && options.KillPodOptions.Evict {
				klog.V(4).InfoS("Pod is being evicted by the kubelet, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
				status.evicted = true
			} else {
				klog.V(4).InfoS("Pod is being removed by the kubelet, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			}
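			// whether or not this kill was flagged as an eviction, the pod now begins terminating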
			status.terminatingAt = now
			becameTerminating = true
		}
	}

	// once a pod is terminating, all updates are kills and the grace period can only decrease
	var wasGracePeriodShortened bool
	switch {
	case status.IsTerminated():
		// A terminated pod may still be waiting for cleanup - if we receive a runtime pod kill request
		// due to housekeeping seeing an older cached version of the runtime pod simply ignore it until
		// after the pod worker completes.
		if isRuntimePod {
			klog.V(3).InfoS("Pod is waiting for termination, ignoring runtime-only kill until after pod worker is fully terminated", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType)
			return
		}

		if options.KillPodOptions != nil {
			if ch := options.KillPodOptions.CompletedCh; ch != nil {
				close(ch)
			}
		}
		options.KillPodOptions = nil

	case status.IsTerminationRequested():
		if options.KillPodOptions == nil {
			options.KillPodOptions = &KillPodOptions{}
		}

		if ch := options.KillPodOptions.CompletedCh; ch != nil {
			status.notifyPostTerminating = append(status.notifyPostTerminating, ch)
		}
		if fn := options.KillPodOptions.PodStatusFunc; fn != nil {
			status.statusPostTerminating = append(status.statusPostTerminating, fn)
		}

		gracePeriod, gracePeriodShortened := calculateEffectiveGracePeriod(status, pod, options.KillPodOptions)

		wasGracePeriodShortened = gracePeriodShortened
		status.gracePeriod = gracePeriod
		// always set the grace period for syncTerminatingPod so we don't have to recalculate;
		// it will never be zero.
		options.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &gracePeriod

	default:
		// KillPodOptions is not valid for sync actions outside of the terminating phase
		if options.KillPodOptions != nil {
			if ch := options.KillPodOptions.CompletedCh; ch != nil {
				close(ch)
			}
			options.KillPodOptions = nil
		}
	}

	// start the pod worker goroutine if it doesn't exist
	podUpdates, exists := p.podUpdates[uid]
	if !exists {
		// buffer the channel to avoid blocking this method
		podUpdates = make(chan struct{}, 1)
		p.podUpdates[uid] = podUpdates

		// ensure that static pods start in the order they are received by UpdatePod
		if kubetypes.IsStaticPod(pod) {
			p.waitingToStartStaticPodsByFullname[status.fullname] =
				append(p.waitingToStartStaticPodsByFullname[status.fullname], uid)
		}

		// allow testing of delays in the pod update channel
		var outCh <-chan struct{}
		if p.workerChannelFn != nil {
			outCh = p.workerChannelFn(uid, podUpdates)
		} else {
			outCh = podUpdates
		}

		// spawn a pod worker
		go func() {
			// TODO: this should be a wait.Until with backoff to handle panics, and
			// accept a context for shutdown
			defer runtime.HandleCrash()
			defer klog.V(3).InfoS("Pod worker has stopped", "podUID", uid)
			p.podWorkerLoop(uid, outCh)
		}()
	}

	// measure the maximum latency between a call to UpdatePod and when the pod worker reacts to it
	// by preserving the oldest StartTime
	if status.pendingUpdate != nil && !status.pendingUpdate.StartTime.IsZero() && status.pendingUpdate.StartTime.Before(options.StartTime) {
		options.StartTime = status.pendingUpdate.StartTime
	}

	// notify the pod worker there is a pending update
	status.pendingUpdate = &options
	status.working = true
	klog.V(4).InfoS("Notifying pod of pending update", "pod", klog.KRef(ns, name), "podUID", uid, "workType", status.WorkType())
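	// the send below is non-blocking: the channel has a buffer of one, and if a signal is
	// already pending the worker will still pick up the most recent pendingUpdate when it wakes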
	select {
	case podUpdates <- struct{}{}:
	default:
	}

	if (becameTerminating || wasGracePeriodShortened) && status.cancelFn != nil {
		klog.V(3).InfoS("Cancelling current pod sync", "pod", klog.KRef(ns, name), "podUID", uid, "workType", status.WorkType())
		status.cancelFn()
		return
	}
}

// calculateEffectiveGracePeriod sets the initial grace period for a newly terminating pod or allows a
// shorter grace period to be provided, returning the desired value.
func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *KillPodOptions) (int64, bool) {
	// enforce the restriction that a grace period can only decrease and track whatever our value is,
	// then ensure a calculated value is passed down to lower levels
	gracePeriod := status.gracePeriod
	overridden := false
	// this value is bedrock truth - the apiserver owns this value, which it has already calculated
	if override := pod.DeletionGracePeriodSeconds; override != nil {
		if gracePeriod == 0 || *override < gracePeriod {
			gracePeriod = *override
			overridden = true
		}
	}
	// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
	if options != nil {
		if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
			if gracePeriod == 0 || *override < gracePeriod {
				gracePeriod = *override
				overridden = true
			}
		}
	}
	// make a best effort to default this value to the pod's desired intent, in the event
	// the kubelet provided no requested value (graceful termination?)
	if !overridden && gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
	}
	// no matter what, we always supply a grace period of at least 1
	if gracePeriod < 1 {
		gracePeriod = 1
	}
	return gracePeriod, status.gracePeriod != 0 && status.gracePeriod != gracePeriod
}

// allowPodStart tries to start the pod and returns true if allowed, otherwise
// it requeues the pod and returns false. If the pod will never be able to start
// because data is missing, or the pod was terminated before start, canEverStart
// is false. This method can only be called while holding the pod lock.
func (p *podWorkers) allowPodStart(pod *v1.Pod) (canStart bool, canEverStart bool) {
	if !kubetypes.IsStaticPod(pod) {
		// TODO: Do we want to allow non-static pods with the same full name?
		// Note that it may disable the force deletion of pods.
		return true, true
	}
	status, ok := p.podSyncStatuses[pod.UID]
	if !ok {
		klog.ErrorS(nil, "Pod sync status does not exist, the worker should not be running", "pod", klog.KObj(pod), "podUID", pod.UID)
		return false, false
	}
	if status.IsTerminationRequested() {
		return false, false
	}
	if !p.allowStaticPodStart(status.fullname, pod.UID) {
		p.workQueue.Enqueue(pod.UID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor))
		return false, true
	}
	return true, true
}

// allowStaticPodStart tries to start the static pod and returns true if
//  1. there are no other started static pods with the same fullname
//  2. the uid matches that of the first valid static pod waiting to start
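//
// For example, if two static pods sharing a full name are created in quick succession
// (UID A, then UID B), A is started first and B waits in waitingToStartStaticPodsByFullname;
// B can roughly only start once A's worker has finished and A's entry has been removed from
// startedStaticPodsByFullname.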
func (p *podWorkers) allowStaticPodStart(fullname string, uid types.UID) bool {
	startedUID, started := p.startedStaticPodsByFullname[fullname]
	if started {
		return startedUID == uid
	}

	waitingPods := p.waitingToStartStaticPodsByFullname[fullname]
	// TODO: This is O(N) with respect to the number of updates to static pods
	// with overlapping full names, and ideally would be O(1).
	for i, waitingUID := range waitingPods {
		// has pod already terminated or been deleted?
		status, ok := p.podSyncStatuses[waitingUID]
		if !ok || status.IsTerminationRequested() || status.IsTerminated() {
			continue
		}
		// another pod is next in line
		if waitingUID != uid {
			p.waitingToStartStaticPodsByFullname[fullname] = waitingPods[i:]
			return false
		}
		// we are up next, remove ourselves
		waitingPods = waitingPods[i+1:]
		break
	}
	if len(waitingPods) != 0 {
		p.waitingToStartStaticPodsByFullname[fullname] = waitingPods
	} else {
		delete(p.waitingToStartStaticPodsByFullname, fullname)
	}
	p.startedStaticPodsByFullname[fullname] = uid
	return true
}

// cleanupUnstartedPod is invoked if a pod that has never been started receives a termination
// signal before it can be started. This method must be called holding the pod lock.
func (p *podWorkers) cleanupUnstartedPod(pod *v1.Pod, status *podSyncStatus) {
	p.cleanupPodUpdates(pod.UID)

	if status.terminatingAt.IsZero() {
		klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
	}
	if !status.terminatedAt.IsZero() {
		klog.V(4).InfoS("Pod worker is complete and had terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
	}
	status.finished = true
	status.working = false
	status.terminatedAt = p.clock.Now()

	if p.startedStaticPodsByFullname[status.fullname] == pod.UID {
		delete(p.startedStaticPodsByFullname, status.fullname)
	}
}

// startPodSync is invoked by each pod worker goroutine when a message arrives on the pod update channel.
// This method consumes a pending update, initializes a context, decides whether the pod is already started
// or can be started, and updates the cached pod state so that downstream components can observe what the
// pod worker goroutine is currently attempting to do. If ok is false, there is no available event. If any
// of the boolean values is false, ensure the appropriate cleanup happens before returning.
//
// This method should ensure that either status.pendingUpdate is cleared and merged into status.activeUpdate,
// or when a pod cannot be started status.pendingUpdate remains the same. Pods that have not been started
// should never have an activeUpdate because that is exposed to downstream components on started pods.
func (p *podWorkers) startPodSync(podUID types.UID) (ctx context.Context, update podWork, canStart, canEverStart, ok bool) {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	// verify we are known to the pod worker still
	status, ok := p.podSyncStatuses[podUID]
	if !ok {
		// pod status has disappeared, the worker should exit
		klog.V(4).InfoS("Pod worker no longer has status, worker should exit", "podUID", podUID)
		return nil, update, false, false, false
	}
	if !status.working {
		// working is used by unit tests to observe whether a worker is currently acting on this pod
		klog.V(4).InfoS("Pod should be marked as working by the pod worker, programmer error", "podUID", podUID)
	}
	if status.pendingUpdate == nil {
		// no update available, this means we were queued without work being added or there is a
		// race condition, both of which are unexpected
		status.working = false
		klog.V(4).InfoS("Pod worker received no pending work, programmer error?", "podUID", podUID)
		return nil, update, false, false, false
	}

	// consume the pending update
	update.WorkType = status.WorkType()
	update.Options = *status.pendingUpdate
	status.pendingUpdate = nil
	select {
	case <-p.podUpdates[podUID]:
		// ensure the pod update channel is empty (it is only ever written to under lock)
	default:
	}

	// initialize a context for the worker if one does not exist
	if status.ctx == nil || status.ctx.Err() == context.Canceled {
		status.ctx, status.cancelFn = context.WithCancel(context.Background())
	}
	ctx = status.ctx

	// if we are already started, make our state visible to downstream components
	if status.IsStarted() {
		status.mergeLastUpdate(update.Options)
		return ctx, update, true, true, true
	}

	// if we are already terminating and we only have a running pod, allow the worker
	// to "start" since we are immediately moving to terminating
	if update.Options.RunningPod != nil && update.WorkType == TerminatingPod {
		status.mergeLastUpdate(update.Options)
		return ctx, update, true, true, true
	}

	// If we receive an update where Pod is nil (running pod is set) but haven't
	// started yet, we can only terminate the pod, not start it. We should not be
	// asked to start such a pod, but guard here just in case an accident occurs.
	if update.Options.Pod == nil {
		status.mergeLastUpdate(update.Options)
		klog.V(4).InfoS("Running pod cannot start ever, programmer error", "pod", klog.KObj(update.Options.Pod), "podUID", podUID, "updateType", update.WorkType)
		return ctx, update, false, false, true
	}

	// verify we can start
	canStart, canEverStart = p.allowPodStart(update.Options.Pod)
	switch {
	case !canEverStart:
		p.cleanupUnstartedPod(update.Options.Pod, status)
		status.working = false
		if start := update.Options.StartTime; !start.IsZero() {
			metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start))
		}
		klog.V(4).InfoS("Pod cannot start ever", "pod", klog.KObj(update.Options.Pod), "podUID", podUID, "updateType", update.WorkType)
		return ctx, update, canStart, canEverStart, true
	case !canStart:
		// this is the only path where we don't start the pod, so we need to put the change back in pendingUpdate
		status.pendingUpdate = &update.Options
		status.working = false
		klog.V(4).InfoS("Pod cannot start yet", "pod", klog.KObj(update.Options.Pod), "podUID", podUID)
		return ctx, update, canStart, canEverStart, true
	}

	// mark the pod as started
	status.startedAt = p.clock.Now()
	status.mergeLastUpdate(update.Options)

	// If we are admitting the pod and it is new, record the count of containers
	// TODO: We should probably move this into syncPod and add an execution count
	// to the syncPod arguments, and this should be recorded on the first sync.
	// Leaving it here complicates a particularly important loop.
	metrics.ContainersPerPodCount.Observe(float64(len(update.Options.Pod.Spec.Containers)))

	return ctx, update, true, true, true
}

func podUIDAndRefForUpdate(update UpdatePodOptions) (types.UID, klog.ObjectRef) {
	if update.RunningPod != nil {
		return update.RunningPod.ID, klog.KObj(update.RunningPod.ToAPIPod())
	}
	return update.Pod.UID, klog.KObj(update.Pod)
}

// podWorkerLoop manages sequential state updates to a pod in a goroutine, exiting once the final
// state is reached. The loop is responsible for driving the pod through four main phases:
//
//  1. Wait to start, guaranteeing no two pods with the same UID or same fullname are running at the same time
//  2. Sync, orchestrating pod setup by reconciling the desired pod spec with the runtime state of the pod
//  3. Terminating, ensuring all running containers in the pod are stopped
//  4. Terminated, cleaning up any resources that must be released before the pod can be deleted
//
// The podWorkerLoop is driven by updates delivered to UpdatePod and by SyncKnownPods. If a particular
// sync method fails, p.workQueue is updated with backoff but it is the responsibility of the kubelet
// to trigger new UpdatePod calls. SyncKnownPods will only retry pods that are no longer known to the
// caller. When a pod transitions working->terminating or terminating->terminated, the next update is
// queued immediately and no kubelet action is required.
func (p *podWorkers) podWorkerLoop(podUID types.UID, podUpdates <-chan struct{}) {
	var lastSyncTime time.Time
	for range podUpdates {
		ctx, update, canStart, canEverStart, ok := p.startPodSync(podUID)
		// If we had no update waiting, it means someone initialized the channel without filling out pendingUpdate.
1219 if !ok { 1220 continue 1221 } 1222 // If the pod was terminated prior to the pod being allowed to start, we exit the loop. 1223 if !canEverStart { 1224 return 1225 } 1226 // If the pod is not yet ready to start, continue and wait for more updates. 1227 if !canStart { 1228 continue 1229 } 1230 1231 podUID, podRef := podUIDAndRefForUpdate(update.Options) 1232 1233 klog.V(4).InfoS("Processing pod event", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) 1234 var isTerminal bool 1235 err := func() error { 1236 // The worker is responsible for ensuring the sync method sees the appropriate 1237 // status updates on resyncs (the result of the last sync), transitions to 1238 // terminating (no wait), or on terminated (whatever the most recent state is). 1239 // Only syncing and terminating can generate pod status changes, while terminated 1240 // pods ensure the most recent status makes it to the api server. 1241 var status *kubecontainer.PodStatus 1242 var err error 1243 switch { 1244 case update.Options.RunningPod != nil: 1245 // when we receive a running pod, we don't need status at all because we are 1246 // guaranteed to be terminating and we skip updates to the pod 1247 default: 1248 // wait until we see the next refresh from the PLEG via the cache (max 2s) 1249 // TODO: this adds ~1s of latency on all transitions from sync to terminating 1250 // to terminated, and on all termination retries (including evictions). We should 1251 // improve latency by making the pleg continuous and by allowing pod status 1252 // changes to be refreshed when key events happen (killPod, sync->terminating). 1253 // Improving this latency also reduces the possibility that a terminated 1254 // container's status is garbage collected before we have a chance to update the 1255 // API server (thus losing the exit code). 
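// GetNewerThan blocks until the cache holds a status for this pod that was observed
// after lastSyncTime, so each sync sees the runtime effects of the previous one rather
// than a stale snapshot.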
1256 status, err = p.podCache.GetNewerThan(update.Options.Pod.UID, lastSyncTime)
1257
1258 if err != nil {
1259 // This is the legacy event thrown by the manage pod loop; all other events are now
1260 // dispatched from syncPodFn
1261 p.recorder.Eventf(update.Options.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err)
1262 return err
1263 }
1264 }
1265
1266 // Take the appropriate action (illegal phases are prevented by UpdatePod)
1267 switch {
1268 case update.WorkType == TerminatedPod:
1269 err = p.podSyncer.SyncTerminatedPod(ctx, update.Options.Pod, status)
1270
1271 case update.WorkType == TerminatingPod:
1272 var gracePeriod *int64
1273 if opt := update.Options.KillPodOptions; opt != nil {
1274 gracePeriod = opt.PodTerminationGracePeriodSecondsOverride
1275 }
1276 podStatusFn := p.acknowledgeTerminating(podUID)
1277
1278 // if we only have a running pod, terminate it directly
1279 if update.Options.RunningPod != nil {
1280 err = p.podSyncer.SyncTerminatingRuntimePod(ctx, update.Options.RunningPod)
1281 } else {
1282 err = p.podSyncer.SyncTerminatingPod(ctx, update.Options.Pod, status, gracePeriod, podStatusFn)
1283 }
1284
1285 default:
1286 isTerminal, err = p.podSyncer.SyncPod(ctx, update.Options.UpdateType, update.Options.Pod, update.Options.MirrorPod, status)
1287 }
1288
1289 lastSyncTime = p.clock.Now()
1290 return err
1291 }()
1292
1293 var phaseTransition bool
1294 switch {
1295 case err == context.Canceled:
1296 // when the context is cancelled we expect an update to already be queued
1297 klog.V(2).InfoS("Sync exited with context cancellation error", "pod", podRef, "podUID", podUID, "updateType", update.WorkType)
1298
1299 case err != nil:
1300 // we will queue a retry
1301 klog.ErrorS(err, "Error syncing pod, skipping", "pod", podRef, "podUID", podUID)
1302
1303 case update.WorkType == TerminatedPod:
1304 // we can shut down the worker
1305 p.completeTerminated(podUID)
1306 if start := update.Options.StartTime; !start.IsZero() {
1307 metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start))
1308 }
1309 klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType)
1310 return
1311
1312 case update.WorkType == TerminatingPod:
1313 // pods that don't exist in config don't need to be terminated, other loops will clean them up
1314 if update.Options.RunningPod != nil {
1315 p.completeTerminatingRuntimePod(podUID)
1316 if start := update.Options.StartTime; !start.IsZero() {
1317 metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start))
1318 }
1319 klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType)
1320 return
1321 }
1322 // otherwise we move to the terminating phase
1323 p.completeTerminating(podUID)
1324 phaseTransition = true
1325
1326 case isTerminal:
1327 // if syncPod indicated we are now terminal, set the appropriate pod status to move to terminating
1328 klog.V(4).InfoS("Pod is terminal", "pod", podRef, "podUID", podUID, "updateType", update.WorkType)
1329 p.completeSync(podUID)
1330 phaseTransition = true
1331 }
1332
1333 // queue a retry if necessary, then put the next event in the channel if any
1334 p.completeWork(podUID, phaseTransition, err)
1335 if start := update.Options.StartTime; !start.IsZero() {
1336 metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start))
1337 }
1338
klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) 1339 } 1340 } 1341 1342 // acknowledgeTerminating sets the terminating flag on the pod status once the pod worker sees 1343 // the termination state so that other components know no new containers will be started in this 1344 // pod. It then returns the status function, if any, that applies to this pod. 1345 func (p *podWorkers) acknowledgeTerminating(podUID types.UID) PodStatusFunc { 1346 p.podLock.Lock() 1347 defer p.podLock.Unlock() 1348 1349 status, ok := p.podSyncStatuses[podUID] 1350 if !ok { 1351 return nil 1352 } 1353 1354 if !status.terminatingAt.IsZero() && !status.startedTerminating { 1355 klog.V(4).InfoS("Pod worker has observed request to terminate", "podUID", podUID) 1356 status.startedTerminating = true 1357 } 1358 1359 if l := len(status.statusPostTerminating); l > 0 { 1360 return status.statusPostTerminating[l-1] 1361 } 1362 return nil 1363 } 1364 1365 // completeSync is invoked when syncPod completes successfully and indicates the pod is now terminal and should 1366 // be terminated. This happens when the natural pod lifecycle completes - any pod which is not RestartAlways 1367 // exits. Unnatural completions, such as evictions, API driven deletion or phase transition, are handled by 1368 // UpdatePod. 1369 func (p *podWorkers) completeSync(podUID types.UID) { 1370 p.podLock.Lock() 1371 defer p.podLock.Unlock() 1372 1373 klog.V(4).InfoS("Pod indicated lifecycle completed naturally and should now terminate", "podUID", podUID) 1374 1375 status, ok := p.podSyncStatuses[podUID] 1376 if !ok { 1377 klog.V(4).InfoS("Pod had no status in completeSync, programmer error?", "podUID", podUID) 1378 return 1379 } 1380 1381 // update the status of the pod 1382 if status.terminatingAt.IsZero() { 1383 status.terminatingAt = p.clock.Now() 1384 } else { 1385 klog.V(4).InfoS("Pod worker attempted to set terminatingAt twice, likely programmer error", "podUID", podUID) 1386 } 1387 status.startedTerminating = true 1388 1389 // the pod has now transitioned to terminating and we want to run syncTerminatingPod 1390 // as soon as possible, so if no update is already waiting queue a synthetic update 1391 p.requeueLastPodUpdate(podUID, status) 1392 } 1393 1394 // completeTerminating is invoked when syncTerminatingPod completes successfully, which means 1395 // no container is running, no container will be started in the future, and we are ready for 1396 // cleanup. This updates the termination state which prevents future syncs and will ensure 1397 // other kubelet loops know this pod is not running any containers. 
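// Closing the notifyPostTerminating channels here unblocks callers waiting on
// KillPodOptions.CompletedCh, including the function returned by killPodNow.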
1398 func (p *podWorkers) completeTerminating(podUID types.UID) { 1399 p.podLock.Lock() 1400 defer p.podLock.Unlock() 1401 1402 klog.V(4).InfoS("Pod terminated all containers successfully", "podUID", podUID) 1403 1404 status, ok := p.podSyncStatuses[podUID] 1405 if !ok { 1406 return 1407 } 1408 1409 // update the status of the pod 1410 if status.terminatingAt.IsZero() { 1411 klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "podUID", podUID) 1412 } 1413 status.terminatedAt = p.clock.Now() 1414 for _, ch := range status.notifyPostTerminating { 1415 close(ch) 1416 } 1417 status.notifyPostTerminating = nil 1418 status.statusPostTerminating = nil 1419 1420 // the pod has now transitioned to terminated and we want to run syncTerminatedPod 1421 // as soon as possible, so if no update is already waiting queue a synthetic update 1422 p.requeueLastPodUpdate(podUID, status) 1423 } 1424 1425 // completeTerminatingRuntimePod is invoked when syncTerminatingPod completes successfully, 1426 // which means an orphaned pod (no config) is terminated and we can exit. Since orphaned 1427 // pods have no API representation, we want to exit the loop at this point and ensure no 1428 // status is present afterwards - the running pod is truly terminated when this is invoked. 1429 func (p *podWorkers) completeTerminatingRuntimePod(podUID types.UID) { 1430 p.podLock.Lock() 1431 defer p.podLock.Unlock() 1432 1433 klog.V(4).InfoS("Pod terminated all orphaned containers successfully and worker can now stop", "podUID", podUID) 1434 1435 p.cleanupPodUpdates(podUID) 1436 1437 status, ok := p.podSyncStatuses[podUID] 1438 if !ok { 1439 return 1440 } 1441 if status.terminatingAt.IsZero() { 1442 klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "podUID", podUID) 1443 } 1444 status.terminatedAt = p.clock.Now() 1445 status.finished = true 1446 status.working = false 1447 1448 if p.startedStaticPodsByFullname[status.fullname] == podUID { 1449 delete(p.startedStaticPodsByFullname, status.fullname) 1450 } 1451 1452 // A runtime pod is transient and not part of the desired state - once it has reached 1453 // terminated we can abandon tracking it. 1454 delete(p.podSyncStatuses, podUID) 1455 } 1456 1457 // completeTerminated is invoked after syncTerminatedPod completes successfully and means we 1458 // can stop the pod worker. The pod is finalized at this point. 1459 func (p *podWorkers) completeTerminated(podUID types.UID) { 1460 p.podLock.Lock() 1461 defer p.podLock.Unlock() 1462 1463 klog.V(4).InfoS("Pod is complete and the worker can now stop", "podUID", podUID) 1464 1465 p.cleanupPodUpdates(podUID) 1466 1467 status, ok := p.podSyncStatuses[podUID] 1468 if !ok { 1469 return 1470 } 1471 if status.terminatingAt.IsZero() { 1472 klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "podUID", podUID) 1473 } 1474 if status.terminatedAt.IsZero() { 1475 klog.V(4).InfoS("Pod worker is complete but did not have terminatedAt set, likely programmer error", "podUID", podUID) 1476 } 1477 status.finished = true 1478 status.working = false 1479 1480 if p.startedStaticPodsByFullname[status.fullname] == podUID { 1481 delete(p.startedStaticPodsByFullname, status.fullname) 1482 } 1483 } 1484 1485 // completeWork requeues on error or the next sync interval and then immediately executes any pending 1486 // work. 
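// The requeue delay is zero after a phase transition, the regular resync interval on
// success, a shorter transient backoff when the network is not ready, and the standard
// backoff period for any other sync error; the nonzero delays are jittered.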
1487 func (p *podWorkers) completeWork(podUID types.UID, phaseTransition bool, syncErr error) {
1488 // Requeue the pod for a future sync based on the outcome of the last sync.
1489 switch {
1490 case phaseTransition:
1491 p.workQueue.Enqueue(podUID, 0)
1492 case syncErr == nil:
1493 // No error; requeue at the regular resync interval.
1494 p.workQueue.Enqueue(podUID, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor))
1495 case strings.Contains(syncErr.Error(), NetworkNotReadyErrorMsg):
1496 // Network is not ready; back off for a short period of time and retry as the network might be ready soon.
1497 p.workQueue.Enqueue(podUID, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor))
1498 default:
1499 // Error occurred during the sync; back off and then retry.
1500 p.workQueue.Enqueue(podUID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor))
1501 }
1502
1503 // if there is a pending update for this worker, requeue immediately, otherwise
1504 // clear working status
1505 p.podLock.Lock()
1506 defer p.podLock.Unlock()
1507 if status, ok := p.podSyncStatuses[podUID]; ok {
1508 if status.pendingUpdate != nil {
1509 select {
1510 case p.podUpdates[podUID] <- struct{}{}:
1511 klog.V(4).InfoS("Requeueing pod due to pending update", "podUID", podUID)
1512 default:
1513 klog.V(4).InfoS("Pending update already queued", "podUID", podUID)
1514 }
1515 } else {
1516 status.working = false
1517 }
1518 }
1519 }
1520
1521 // SyncKnownPods will purge any fully terminated pods that are not in the desiredPods
1522 // list, which means SyncKnownPods must be called in a threadsafe manner with respect
1523 // to calls to UpdatePod for new pods. Because the pod worker depends on UpdatePod being
1524 // invoked to drive a pod's state machine, if a pod is missing from the desired list the
1525 // pod worker becomes responsible for delivering that update. The method returns a map
1526 // of PodWorkerSync values for the workers that remain, whose State is TerminatedPod,
1527 // TerminatingPod, or SyncPod depending on whether the pod is terminated, terminating,
1528 // or syncing.
1529 func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerSync {
1530 workers := make(map[types.UID]PodWorkerSync)
1531 known := make(map[types.UID]struct{})
1532 for _, pod := range desiredPods {
1533 known[pod.UID] = struct{}{}
1534 }
1535
1536 p.podLock.Lock()
1537 defer p.podLock.Unlock()
1538
1539 p.podsSynced = true
1540 for uid, status := range p.podSyncStatuses {
1541 // We retain the worker history of any pod that is still desired according to
1542 // its UID. However, there are two scenarios during a sync that result in us
1543 // needing to purge the history:
1544 //
1545 // 1. The pod is no longer desired (the local version is orphaned)
1546 // 2. The pod received a kill update and then a subsequent create, which means
1547 // the UID was reused in the source config (vanishingly rare for API servers,
1548 // common for static pods that have specified a fixed UID)
1549 //
1550 // In the former case we wish to bound the amount of information we store for
1551 // deleted pods. In the latter case we wish to minimize the amount of time before
1552 // we restart the static pod. If we succeed at removing the worker, then we
1553 // omit it from the returned map of known workers, and the caller of SyncKnownPods
1554 // is expected to send a new UpdatePod({UpdateType: Create}).
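// A worker is purged only once removeTerminatedWorker confirms it has fully finished;
// otherwise it remains in the returned map and, if orphaned, is nudged toward termination.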
1555 _, knownPod := known[uid]
1556 orphan := !knownPod
1557 if status.restartRequested || orphan {
1558 if p.removeTerminatedWorker(uid, status, orphan) {
1559 // no worker running, we won't return it
1560 continue
1561 }
1562 }
1563
1564 sync := PodWorkerSync{
1565 State: status.WorkType(),
1566 Orphan: orphan,
1567 }
1568 switch {
1569 case status.activeUpdate != nil:
1570 if status.activeUpdate.Pod != nil {
1571 sync.HasConfig = true
1572 sync.Static = kubetypes.IsStaticPod(status.activeUpdate.Pod)
1573 }
1574 case status.pendingUpdate != nil:
1575 if status.pendingUpdate.Pod != nil {
1576 sync.HasConfig = true
1577 sync.Static = kubetypes.IsStaticPod(status.pendingUpdate.Pod)
1578 }
1579 }
1580 workers[uid] = sync
1581 }
1582 return workers
1583 }
1584
1585 // removeTerminatedWorker cleans up and removes the worker status for a worker
1586 // that has reached a terminal state of "finished" - has successfully exited
1587 // syncTerminatedPod. This "forgets" a pod by UID and allows another pod to be
1588 // recreated with the same UID. The kubelet preserves state about recently
1589 // terminated pods to prevent accidentally restarting a terminal pod; the amount
1590 // of state preserved is proportional to the number of pods described in the pod
1591 // config. The method returns true if the worker was completely removed.
1592 func (p *podWorkers) removeTerminatedWorker(uid types.UID, status *podSyncStatus, orphaned bool) bool {
1593 if !status.finished {
1594 // If the pod worker has not reached terminal state and the pod is still known, we wait.
1595 if !orphaned {
1596 klog.V(4).InfoS("Pod worker has been requested for removal but is still not fully terminated", "podUID", uid)
1597 return false
1598 }
1599
1600 // all orphaned pods are considered deleted
1601 status.deleted = true
1602
1603 // When a pod is no longer in the desired set, the pod is considered orphaned and
1604 // the pod worker becomes responsible for driving the pod to completion (there is no
1605 // guarantee another component will notify us of updates).
1606 switch {
1607 case !status.IsStarted() && !status.observedRuntime:
1608 // The pod has not been started, which means we can safely clean up the pod - the
1609 // pod worker will shut down as a result of this change without executing a sync.
1610 klog.V(4).InfoS("Pod is orphaned and has not been started", "podUID", uid)
1611 case !status.IsTerminationRequested():
1612 // The pod has been started but termination has not been requested - set the appropriate
1613 // timestamp and notify the pod worker. Because the pod has been synced at least once,
1614 // the value of status.activeUpdate will be the fallback for the next sync.
1615 status.terminatingAt = p.clock.Now()
1616 if status.activeUpdate != nil && status.activeUpdate.Pod != nil {
1617 status.gracePeriod, _ = calculateEffectiveGracePeriod(status, status.activeUpdate.Pod, nil)
1618 } else {
1619 status.gracePeriod = 1
1620 }
1621 p.requeueLastPodUpdate(uid, status)
1622 klog.V(4).InfoS("Pod is orphaned and still running, began terminating", "podUID", uid)
1623 return false
1624 default:
1625 // The pod is already moving towards termination, notify the pod worker. Because the pod
1626 // has been synced at least once, the value of status.activeUpdate will be the fallback for
1627 // the next sync.
1628 p.requeueLastPodUpdate(uid, status)
1629 klog.V(4).InfoS("Pod is orphaned and still terminating, notified the pod worker", "podUID", uid)
1630 return false
1631 }
1632 }
1633
1634 if status.restartRequested {
1635 klog.V(4).InfoS("Pod has been terminated but another pod with the same UID was created, remove history to allow restart", "podUID", uid)
1636 } else {
1637 klog.V(4).InfoS("Pod has been terminated and is no longer known to the kubelet, remove all history", "podUID", uid)
1638 }
1639 delete(p.podSyncStatuses, uid)
1640 p.cleanupPodUpdates(uid)
1641
1642 if p.startedStaticPodsByFullname[status.fullname] == uid {
1643 delete(p.startedStaticPodsByFullname, status.fullname)
1644 }
1645 return true
1646 }
1647
1648 // killPodNow returns a KillPodFunc that can be used to kill a pod.
1649 // It is intended to be injected into other modules that need to kill a pod.
1650 func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc {
1651 return func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error {
1652 // determine the grace period to use when killing the pod
1653 gracePeriod := int64(0)
1654 if gracePeriodOverride != nil {
1655 gracePeriod = *gracePeriodOverride
1656 } else if pod.Spec.TerminationGracePeriodSeconds != nil {
1657 gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
1658 }
1659
1660 // we time out and return an error if we don't get a callback within a reasonable time.
1661 // the default timeout is relative to the grace period (we settle on 10s to wait for kubelet->runtime traffic to complete in the sigkill case)
1662 timeout := gracePeriod + (gracePeriod / 2)
1663 minTimeout := int64(10)
1664 if timeout < minTimeout {
1665 timeout = minTimeout
1666 }
1667 timeoutDuration := time.Duration(timeout) * time.Second
1668
1669 // open a channel we block against until we get a result
1670 ch := make(chan struct{}, 1)
1671 podWorkers.UpdatePod(UpdatePodOptions{
1672 Pod: pod,
1673 UpdateType: kubetypes.SyncPodKill,
1674 KillPodOptions: &KillPodOptions{
1675 CompletedCh: ch,
1676 Evict: isEvicted,
1677 PodStatusFunc: statusFn,
1678 PodTerminationGracePeriodSecondsOverride: gracePeriodOverride,
1679 },
1680 })
1681
1682 // wait for either a response, or a timeout
1683 select {
1684 case <-ch:
1685 return nil
1686 case <-time.After(timeoutDuration):
1687 recorder.Eventf(pod, v1.EventTypeWarning, events.ExceededGracePeriod, "Container runtime did not kill the pod within specified grace period.")
1688 return fmt.Errorf("timeout waiting to kill pod")
1689 }
1690 }
1691 }
1692
1693 // cleanupPodUpdates closes the podUpdates channel and removes it from the
1694 // podUpdates map so that the corresponding pod worker can stop. It also
1695 // removes any undelivered work. This method must be called holding the
1696 // pod lock.
1697 func (p *podWorkers) cleanupPodUpdates(uid types.UID) {
1698 if ch, ok := p.podUpdates[uid]; ok {
1699 close(ch)
1700 }
1701 delete(p.podUpdates, uid)
1702 }
1703
1704 // requeueLastPodUpdate creates a new pending pod update from the most recently
1705 // executed update if no update is already queued, and then notifies the pod
1706 // worker goroutine of the update. This method must be called while holding
1707 // the pod lock.
1708 func (p *podWorkers) requeueLastPodUpdate(podUID types.UID, status *podSyncStatus) {
1709 // if there is already an update queued, we can use that instead, or if
1710 // we have no previously executed update, we cannot replay it.
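// Otherwise we copy activeUpdate (the options most recently seen by the sync methods)
// into pendingUpdate and signal the worker, so it re-runs with that state and can
// advance to its next phase without waiting for a new external update.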
1711 if status.pendingUpdate != nil || status.activeUpdate == nil { 1712 return 1713 } 1714 copied := *status.activeUpdate 1715 status.pendingUpdate = &copied 1716 1717 // notify the pod worker 1718 status.working = true 1719 select { 1720 case p.podUpdates[podUID] <- struct{}{}: 1721 default: 1722 } 1723 }
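// Illustrative sketch (not part of this file's implementation): one way a component that
// has been handed the eviction.KillPodFunc built by killPodNow might invoke it. The names
// kl.podWorkers and kl.recorder and the five second grace period are hypothetical.
//
//	killFunc := killPodNow(kl.podWorkers, kl.recorder)
//	gracePeriod := int64(5)
//	err := killFunc(pod, true /* isEvicted */, &gracePeriod, func(podStatus *v1.PodStatus) {
//		// optional override applied to the pod status at the time the pod is killed
//		podStatus.Reason = "Evicted"
//	})
//	if err != nil {
//		// the pod worker did not close CompletedCh before the computed timeout expired
//	}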