k8s.io/kubernetes@v1.29.3/pkg/kubelet/kuberuntime/kuberuntime_gc.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kuberuntime 18 19 import ( 20 "context" 21 "fmt" 22 "os" 23 "path/filepath" 24 "sort" 25 "time" 26 27 "go.opentelemetry.io/otel/trace" 28 "k8s.io/apimachinery/pkg/types" 29 utilerrors "k8s.io/apimachinery/pkg/util/errors" 30 "k8s.io/apimachinery/pkg/util/sets" 31 internalapi "k8s.io/cri-api/pkg/apis" 32 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" 33 "k8s.io/klog/v2" 34 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 35 ) 36 37 // containerGC is the manager of garbage collection. 38 type containerGC struct { 39 client internalapi.RuntimeService 40 manager *kubeGenericRuntimeManager 41 podStateProvider podStateProvider 42 tracer trace.Tracer 43 } 44 45 // NewContainerGC creates a new containerGC. 46 func newContainerGC(client internalapi.RuntimeService, podStateProvider podStateProvider, manager *kubeGenericRuntimeManager, tracer trace.Tracer) *containerGC { 47 return &containerGC{ 48 client: client, 49 manager: manager, 50 podStateProvider: podStateProvider, 51 tracer: tracer, 52 } 53 } 54 55 // containerGCInfo is the internal information kept for containers being considered for GC. 56 type containerGCInfo struct { 57 // The ID of the container. 58 id string 59 // The name of the container. 60 name string 61 // Creation time for the container. 62 createTime time.Time 63 // If true, the container is in unknown state. Garbage collector should try 64 // to stop containers before removal. 65 unknown bool 66 } 67 68 // sandboxGCInfo is the internal information kept for sandboxes being considered for GC. 69 type sandboxGCInfo struct { 70 // The ID of the sandbox. 71 id string 72 // Creation time for the sandbox. 73 createTime time.Time 74 // If true, the sandbox is ready or still has containers. 75 active bool 76 } 77 78 // evictUnit is considered for eviction as units of (UID, container name) pair. 79 type evictUnit struct { 80 // UID of the pod. 81 uid types.UID 82 // Name of the container in the pod. 83 name string 84 } 85 86 type containersByEvictUnit map[evictUnit][]containerGCInfo 87 type sandboxesByPodUID map[types.UID][]sandboxGCInfo 88 89 // NumContainers returns the number of containers in this map. 90 func (cu containersByEvictUnit) NumContainers() int { 91 num := 0 92 for key := range cu { 93 num += len(cu[key]) 94 } 95 return num 96 } 97 98 // NumEvictUnits returns the number of pod in this map. 99 func (cu containersByEvictUnit) NumEvictUnits() int { 100 return len(cu) 101 } 102 103 // Newest first. 104 type byCreated []containerGCInfo 105 106 func (a byCreated) Len() int { return len(a) } 107 func (a byCreated) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 108 func (a byCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) } 109 110 // Newest first. 111 type sandboxByCreated []sandboxGCInfo 112 113 func (a sandboxByCreated) Len() int { return len(a) } 114 func (a sandboxByCreated) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 115 func (a sandboxByCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) } 116 117 // enforceMaxContainersPerEvictUnit enforces MaxPerPodContainer for each evictUnit. 118 func (cgc *containerGC) enforceMaxContainersPerEvictUnit(ctx context.Context, evictUnits containersByEvictUnit, MaxContainers int) { 119 for key := range evictUnits { 120 toRemove := len(evictUnits[key]) - MaxContainers 121 122 if toRemove > 0 { 123 evictUnits[key] = cgc.removeOldestN(ctx, evictUnits[key], toRemove) 124 } 125 } 126 } 127 128 // removeOldestN removes the oldest toRemove containers and returns the resulting slice. 129 func (cgc *containerGC) removeOldestN(ctx context.Context, containers []containerGCInfo, toRemove int) []containerGCInfo { 130 // Remove from oldest to newest (last to first). 131 numToKeep := len(containers) - toRemove 132 if numToKeep > 0 { 133 sort.Sort(byCreated(containers)) 134 } 135 for i := len(containers) - 1; i >= numToKeep; i-- { 136 if containers[i].unknown { 137 // Containers in known state could be running, we should try 138 // to stop it before removal. 139 id := kubecontainer.ContainerID{ 140 Type: cgc.manager.runtimeName, 141 ID: containers[i].id, 142 } 143 message := "Container is in unknown state, try killing it before removal" 144 if err := cgc.manager.killContainer(ctx, nil, id, containers[i].name, message, reasonUnknown, nil, nil); err != nil { 145 klog.ErrorS(err, "Failed to stop container", "containerID", containers[i].id) 146 continue 147 } 148 } 149 if err := cgc.manager.removeContainer(ctx, containers[i].id); err != nil { 150 klog.ErrorS(err, "Failed to remove container", "containerID", containers[i].id) 151 } 152 } 153 154 // Assume we removed the containers so that we're not too aggressive. 155 return containers[:numToKeep] 156 } 157 158 // removeOldestNSandboxes removes the oldest inactive toRemove sandboxes and 159 // returns the resulting slice. 160 func (cgc *containerGC) removeOldestNSandboxes(ctx context.Context, sandboxes []sandboxGCInfo, toRemove int) { 161 numToKeep := len(sandboxes) - toRemove 162 if numToKeep > 0 { 163 sort.Sort(sandboxByCreated(sandboxes)) 164 } 165 // Remove from oldest to newest (last to first). 166 for i := len(sandboxes) - 1; i >= numToKeep; i-- { 167 if !sandboxes[i].active { 168 cgc.removeSandbox(ctx, sandboxes[i].id) 169 } 170 } 171 } 172 173 // removeSandbox removes the sandbox by sandboxID. 174 func (cgc *containerGC) removeSandbox(ctx context.Context, sandboxID string) { 175 klog.V(4).InfoS("Removing sandbox", "sandboxID", sandboxID) 176 // In normal cases, kubelet should've already called StopPodSandbox before 177 // GC kicks in. To guard against the rare cases where this is not true, try 178 // stopping the sandbox before removing it. 179 if err := cgc.client.StopPodSandbox(ctx, sandboxID); err != nil { 180 klog.ErrorS(err, "Failed to stop sandbox before removing", "sandboxID", sandboxID) 181 return 182 } 183 if err := cgc.client.RemovePodSandbox(ctx, sandboxID); err != nil { 184 klog.ErrorS(err, "Failed to remove sandbox", "sandboxID", sandboxID) 185 } 186 } 187 188 // evictableContainers gets all containers that are evictable. Evictable containers are: not running 189 // and created more than MinAge ago. 190 func (cgc *containerGC) evictableContainers(ctx context.Context, minAge time.Duration) (containersByEvictUnit, error) { 191 containers, err := cgc.manager.getKubeletContainers(ctx, true) 192 if err != nil { 193 return containersByEvictUnit{}, err 194 } 195 196 evictUnits := make(containersByEvictUnit) 197 newestGCTime := time.Now().Add(-minAge) 198 for _, container := range containers { 199 // Prune out running containers. 200 if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING { 201 continue 202 } 203 204 createdAt := time.Unix(0, container.CreatedAt) 205 if newestGCTime.Before(createdAt) { 206 continue 207 } 208 209 labeledInfo := getContainerInfoFromLabels(container.Labels) 210 containerInfo := containerGCInfo{ 211 id: container.Id, 212 name: container.Metadata.Name, 213 createTime: createdAt, 214 unknown: container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN, 215 } 216 key := evictUnit{ 217 uid: labeledInfo.PodUID, 218 name: containerInfo.name, 219 } 220 evictUnits[key] = append(evictUnits[key], containerInfo) 221 } 222 223 return evictUnits, nil 224 } 225 226 // evict all containers that are evictable 227 func (cgc *containerGC) evictContainers(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { 228 // Separate containers by evict units. 229 evictUnits, err := cgc.evictableContainers(ctx, gcPolicy.MinAge) 230 if err != nil { 231 return err 232 } 233 234 // Remove deleted pod containers if all sources are ready. 235 if allSourcesReady { 236 for key, unit := range evictUnits { 237 if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) { 238 cgc.removeOldestN(ctx, unit, len(unit)) // Remove all. 239 delete(evictUnits, key) 240 } 241 } 242 } 243 244 // Enforce max containers per evict unit. 245 if gcPolicy.MaxPerPodContainer >= 0 { 246 cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, gcPolicy.MaxPerPodContainer) 247 } 248 249 // Enforce max total number of containers. 250 if gcPolicy.MaxContainers >= 0 && evictUnits.NumContainers() > gcPolicy.MaxContainers { 251 // Leave an equal number of containers per evict unit (min: 1). 252 numContainersPerEvictUnit := gcPolicy.MaxContainers / evictUnits.NumEvictUnits() 253 if numContainersPerEvictUnit < 1 { 254 numContainersPerEvictUnit = 1 255 } 256 cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, numContainersPerEvictUnit) 257 258 // If we still need to evict, evict oldest first. 259 numContainers := evictUnits.NumContainers() 260 if numContainers > gcPolicy.MaxContainers { 261 flattened := make([]containerGCInfo, 0, numContainers) 262 for key := range evictUnits { 263 flattened = append(flattened, evictUnits[key]...) 264 } 265 sort.Sort(byCreated(flattened)) 266 267 cgc.removeOldestN(ctx, flattened, numContainers-gcPolicy.MaxContainers) 268 } 269 } 270 return nil 271 } 272 273 // evictSandboxes remove all evictable sandboxes. An evictable sandbox must 274 // meet the following requirements: 275 // 1. not in ready state 276 // 2. contains no containers. 277 // 3. belong to a non-existent (i.e., already removed) pod, or is not the 278 // most recently created sandbox for the pod. 279 func (cgc *containerGC) evictSandboxes(ctx context.Context, evictNonDeletedPods bool) error { 280 containers, err := cgc.manager.getKubeletContainers(ctx, true) 281 if err != nil { 282 return err 283 } 284 285 sandboxes, err := cgc.manager.getKubeletSandboxes(ctx, true) 286 if err != nil { 287 return err 288 } 289 290 // collect all the PodSandboxId of container 291 sandboxIDs := sets.NewString() 292 for _, container := range containers { 293 sandboxIDs.Insert(container.PodSandboxId) 294 } 295 296 sandboxesByPod := make(sandboxesByPodUID, len(sandboxes)) 297 for _, sandbox := range sandboxes { 298 podUID := types.UID(sandbox.Metadata.Uid) 299 sandboxInfo := sandboxGCInfo{ 300 id: sandbox.Id, 301 createTime: time.Unix(0, sandbox.CreatedAt), 302 } 303 304 // Set ready sandboxes and sandboxes that still have containers to be active. 305 if sandbox.State == runtimeapi.PodSandboxState_SANDBOX_READY || sandboxIDs.Has(sandbox.Id) { 306 sandboxInfo.active = true 307 } 308 309 sandboxesByPod[podUID] = append(sandboxesByPod[podUID], sandboxInfo) 310 } 311 312 for podUID, sandboxes := range sandboxesByPod { 313 if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) { 314 // Remove all evictable sandboxes if the pod has been removed. 315 // Note that the latest dead sandbox is also removed if there is 316 // already an active one. 317 cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes)) 318 } else { 319 // Keep latest one if the pod still exists. 320 cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes)-1) 321 } 322 } 323 return nil 324 } 325 326 // evictPodLogsDirectories evicts all evictable pod logs directories. Pod logs directories 327 // are evictable if there are no corresponding pods. 328 func (cgc *containerGC) evictPodLogsDirectories(ctx context.Context, allSourcesReady bool) error { 329 osInterface := cgc.manager.osInterface 330 if allSourcesReady { 331 // Only remove pod logs directories when all sources are ready. 332 dirs, err := osInterface.ReadDir(podLogsRootDirectory) 333 if err != nil { 334 return fmt.Errorf("failed to read podLogsRootDirectory %q: %v", podLogsRootDirectory, err) 335 } 336 for _, dir := range dirs { 337 name := dir.Name() 338 podUID := parsePodUIDFromLogsDirectory(name) 339 if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) { 340 continue 341 } 342 klog.V(4).InfoS("Removing pod logs", "podUID", podUID) 343 err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name)) 344 if err != nil { 345 klog.ErrorS(err, "Failed to remove pod logs directory", "path", name) 346 } 347 } 348 } 349 350 // Remove dead container log symlinks. 351 // TODO(random-liu): Remove this after cluster logging supports CRI container log path. 352 logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix))) 353 for _, logSymlink := range logSymlinks { 354 if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) { 355 if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil { 356 resp, err := cgc.manager.runtimeService.ContainerStatus(ctx, containerID, false) 357 if err != nil { 358 // TODO: we should handle container not found (i.e. container was deleted) case differently 359 // once https://github.com/kubernetes/kubernetes/issues/63336 is resolved 360 klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", err) 361 } else { 362 status := resp.GetStatus() 363 if status == nil { 364 klog.V(4).InfoS("Container status is nil") 365 continue 366 } 367 if status.State != runtimeapi.ContainerState_CONTAINER_EXITED { 368 // Here is how container log rotation works (see containerLogManager#rotateLatestLog): 369 // 370 // 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp)) 371 // 2. reopen the container log 372 // 3. if #2 fails, rename rotated log file back to container log 373 // 374 // There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3). 375 // Hence the symlink may be deemed unhealthy during that period. 376 // See https://github.com/kubernetes/kubernetes/issues/52172 377 // 378 // We only remove unhealthy symlink for dead containers 379 klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink) 380 continue 381 } 382 } 383 } else { 384 klog.V(4).InfoS("Unable to obtain container ID", "err", err) 385 } 386 err := osInterface.Remove(logSymlink) 387 if err != nil { 388 klog.ErrorS(err, "Failed to remove container log dead symlink", "path", logSymlink) 389 } else { 390 klog.V(4).InfoS("Removed symlink", "path", logSymlink) 391 } 392 } 393 } 394 return nil 395 } 396 397 // GarbageCollect removes dead containers using the specified container gc policy. 398 // Note that gc policy is not applied to sandboxes. Sandboxes are only removed when they are 399 // not ready and containing no containers. 400 // 401 // GarbageCollect consists of the following steps: 402 // * gets evictable containers which are not active and created more than gcPolicy.MinAge ago. 403 // * removes oldest dead containers for each pod by enforcing gcPolicy.MaxPerPodContainer. 404 // * removes oldest dead containers by enforcing gcPolicy.MaxContainers. 405 // * gets evictable sandboxes which are not ready and contains no containers. 406 // * removes evictable sandboxes. 407 func (cgc *containerGC) GarbageCollect(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { 408 ctx, otelSpan := cgc.tracer.Start(ctx, "Containers/GarbageCollect") 409 defer otelSpan.End() 410 errors := []error{} 411 // Remove evictable containers 412 if err := cgc.evictContainers(ctx, gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil { 413 errors = append(errors, err) 414 } 415 416 // Remove sandboxes with zero containers 417 if err := cgc.evictSandboxes(ctx, evictNonDeletedPods); err != nil { 418 errors = append(errors, err) 419 } 420 421 // Remove pod sandbox log directory 422 if err := cgc.evictPodLogsDirectories(ctx, allSourcesReady); err != nil { 423 errors = append(errors, err) 424 } 425 return utilerrors.NewAggregate(errors) 426 }