k8s.io/kubernetes@v1.29.3/pkg/kubelet/kuberuntime/kuberuntime_gc.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kuberuntime
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"path/filepath"
    24  	"sort"
    25  	"time"
    26  
    27  	"go.opentelemetry.io/otel/trace"
    28  	"k8s.io/apimachinery/pkg/types"
    29  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	internalapi "k8s.io/cri-api/pkg/apis"
    32  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    33  	"k8s.io/klog/v2"
    34  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    35  )
    36  
// containerGC is the manager of garbage collection.
//
// It bundles the dependencies used by the collection passes in this file:
//   - client: the CRI runtime service, used to stop and remove pod sandboxes.
//   - manager: the owning runtime manager, used to list containers and
//     sandboxes, kill and remove containers, query container status and reach
//     the OS interface for log cleanup.
//   - podStateProvider: consulted to decide whether a pod's content or runtime
//     resources should be removed.
//   - tracer: starts the span that wraps each GarbageCollect call.
type containerGC struct {
	client           internalapi.RuntimeService
	manager          *kubeGenericRuntimeManager
	podStateProvider podStateProvider
	tracer           trace.Tracer
}
    44  
    45  // NewContainerGC creates a new containerGC.
    46  func newContainerGC(client internalapi.RuntimeService, podStateProvider podStateProvider, manager *kubeGenericRuntimeManager, tracer trace.Tracer) *containerGC {
    47  	return &containerGC{
    48  		client:           client,
    49  		manager:          manager,
    50  		podStateProvider: podStateProvider,
    51  		tracer:           tracer,
    52  	}
    53  }
    54  
// containerGCInfo is the internal information kept for containers being considered for GC.
// One value is built per non-running container that is old enough to be evicted.
type containerGCInfo struct {
	// The ID of the container, as reported by the runtime.
	id string
	// The name of the container, taken from the container metadata.
	name string
	// Creation time for the container, converted from the runtime's
	// nanosecond timestamp.
	createTime time.Time
	// If true, the container is in unknown state. Garbage collector should try
	// to stop containers before removal.
	unknown bool
}
    67  
// sandboxGCInfo is the internal information kept for sandboxes being considered for GC.
type sandboxGCInfo struct {
	// The ID of the sandbox, as reported by the runtime.
	id string
	// Creation time for the sandbox, converted from the runtime's nanosecond
	// timestamp.
	createTime time.Time
	// If true, the sandbox is ready or still has containers. Active sandboxes
	// are skipped when sandboxes are removed.
	active bool
}
    77  
// evictUnit is considered for eviction as units of (UID, container name) pair.
// It is used as the map key that groups the containers sharing the same pod UID
// and container name.
type evictUnit struct {
	// UID of the pod, read from the container's labels.
	uid types.UID
	// Name of the container in the pod.
	name string
}
    85  
// containersByEvictUnit groups the containers considered for GC by their
// evict unit, i.e. by (pod UID, container name).
type containersByEvictUnit map[evictUnit][]containerGCInfo

// sandboxesByPodUID groups the sandboxes considered for GC by the UID of the
// pod they belong to.
type sandboxesByPodUID map[types.UID][]sandboxGCInfo
    88  
    89  // NumContainers returns the number of containers in this map.
    90  func (cu containersByEvictUnit) NumContainers() int {
    91  	num := 0
    92  	for key := range cu {
    93  		num += len(cu[key])
    94  	}
    95  	return num
    96  }
    97  
// NumEvictUnits returns the number of evict units, i.e. distinct
// (pod UID, container name) keys, in this map.
func (cu containersByEvictUnit) NumEvictUnits() int {
	return len(cu)
}
   102  
   103  // Newest first.
   104  type byCreated []containerGCInfo
   105  
   106  func (a byCreated) Len() int           { return len(a) }
   107  func (a byCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   108  func (a byCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
   109  
   110  // Newest first.
   111  type sandboxByCreated []sandboxGCInfo
   112  
   113  func (a sandboxByCreated) Len() int           { return len(a) }
   114  func (a sandboxByCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   115  func (a sandboxByCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
   116  
   117  // enforceMaxContainersPerEvictUnit enforces MaxPerPodContainer for each evictUnit.
   118  func (cgc *containerGC) enforceMaxContainersPerEvictUnit(ctx context.Context, evictUnits containersByEvictUnit, MaxContainers int) {
   119  	for key := range evictUnits {
   120  		toRemove := len(evictUnits[key]) - MaxContainers
   121  
   122  		if toRemove > 0 {
   123  			evictUnits[key] = cgc.removeOldestN(ctx, evictUnits[key], toRemove)
   124  		}
   125  	}
   126  }
   127  
   128  // removeOldestN removes the oldest toRemove containers and returns the resulting slice.
   129  func (cgc *containerGC) removeOldestN(ctx context.Context, containers []containerGCInfo, toRemove int) []containerGCInfo {
   130  	// Remove from oldest to newest (last to first).
   131  	numToKeep := len(containers) - toRemove
   132  	if numToKeep > 0 {
   133  		sort.Sort(byCreated(containers))
   134  	}
   135  	for i := len(containers) - 1; i >= numToKeep; i-- {
   136  		if containers[i].unknown {
   137  			// Containers in known state could be running, we should try
   138  			// to stop it before removal.
   139  			id := kubecontainer.ContainerID{
   140  				Type: cgc.manager.runtimeName,
   141  				ID:   containers[i].id,
   142  			}
   143  			message := "Container is in unknown state, try killing it before removal"
   144  			if err := cgc.manager.killContainer(ctx, nil, id, containers[i].name, message, reasonUnknown, nil, nil); err != nil {
   145  				klog.ErrorS(err, "Failed to stop container", "containerID", containers[i].id)
   146  				continue
   147  			}
   148  		}
   149  		if err := cgc.manager.removeContainer(ctx, containers[i].id); err != nil {
   150  			klog.ErrorS(err, "Failed to remove container", "containerID", containers[i].id)
   151  		}
   152  	}
   153  
   154  	// Assume we removed the containers so that we're not too aggressive.
   155  	return containers[:numToKeep]
   156  }
   157  
   158  // removeOldestNSandboxes removes the oldest inactive toRemove sandboxes and
   159  // returns the resulting slice.
   160  func (cgc *containerGC) removeOldestNSandboxes(ctx context.Context, sandboxes []sandboxGCInfo, toRemove int) {
   161  	numToKeep := len(sandboxes) - toRemove
   162  	if numToKeep > 0 {
   163  		sort.Sort(sandboxByCreated(sandboxes))
   164  	}
   165  	// Remove from oldest to newest (last to first).
   166  	for i := len(sandboxes) - 1; i >= numToKeep; i-- {
   167  		if !sandboxes[i].active {
   168  			cgc.removeSandbox(ctx, sandboxes[i].id)
   169  		}
   170  	}
   171  }
   172  
   173  // removeSandbox removes the sandbox by sandboxID.
   174  func (cgc *containerGC) removeSandbox(ctx context.Context, sandboxID string) {
   175  	klog.V(4).InfoS("Removing sandbox", "sandboxID", sandboxID)
   176  	// In normal cases, kubelet should've already called StopPodSandbox before
   177  	// GC kicks in. To guard against the rare cases where this is not true, try
   178  	// stopping the sandbox before removing it.
   179  	if err := cgc.client.StopPodSandbox(ctx, sandboxID); err != nil {
   180  		klog.ErrorS(err, "Failed to stop sandbox before removing", "sandboxID", sandboxID)
   181  		return
   182  	}
   183  	if err := cgc.client.RemovePodSandbox(ctx, sandboxID); err != nil {
   184  		klog.ErrorS(err, "Failed to remove sandbox", "sandboxID", sandboxID)
   185  	}
   186  }
   187  
   188  // evictableContainers gets all containers that are evictable. Evictable containers are: not running
   189  // and created more than MinAge ago.
   190  func (cgc *containerGC) evictableContainers(ctx context.Context, minAge time.Duration) (containersByEvictUnit, error) {
   191  	containers, err := cgc.manager.getKubeletContainers(ctx, true)
   192  	if err != nil {
   193  		return containersByEvictUnit{}, err
   194  	}
   195  
   196  	evictUnits := make(containersByEvictUnit)
   197  	newestGCTime := time.Now().Add(-minAge)
   198  	for _, container := range containers {
   199  		// Prune out running containers.
   200  		if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
   201  			continue
   202  		}
   203  
   204  		createdAt := time.Unix(0, container.CreatedAt)
   205  		if newestGCTime.Before(createdAt) {
   206  			continue
   207  		}
   208  
   209  		labeledInfo := getContainerInfoFromLabels(container.Labels)
   210  		containerInfo := containerGCInfo{
   211  			id:         container.Id,
   212  			name:       container.Metadata.Name,
   213  			createTime: createdAt,
   214  			unknown:    container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN,
   215  		}
   216  		key := evictUnit{
   217  			uid:  labeledInfo.PodUID,
   218  			name: containerInfo.name,
   219  		}
   220  		evictUnits[key] = append(evictUnits[key], containerInfo)
   221  	}
   222  
   223  	return evictUnits, nil
   224  }
   225  
   226  // evict all containers that are evictable
   227  func (cgc *containerGC) evictContainers(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
   228  	// Separate containers by evict units.
   229  	evictUnits, err := cgc.evictableContainers(ctx, gcPolicy.MinAge)
   230  	if err != nil {
   231  		return err
   232  	}
   233  
   234  	// Remove deleted pod containers if all sources are ready.
   235  	if allSourcesReady {
   236  		for key, unit := range evictUnits {
   237  			if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) {
   238  				cgc.removeOldestN(ctx, unit, len(unit)) // Remove all.
   239  				delete(evictUnits, key)
   240  			}
   241  		}
   242  	}
   243  
   244  	// Enforce max containers per evict unit.
   245  	if gcPolicy.MaxPerPodContainer >= 0 {
   246  		cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, gcPolicy.MaxPerPodContainer)
   247  	}
   248  
   249  	// Enforce max total number of containers.
   250  	if gcPolicy.MaxContainers >= 0 && evictUnits.NumContainers() > gcPolicy.MaxContainers {
   251  		// Leave an equal number of containers per evict unit (min: 1).
   252  		numContainersPerEvictUnit := gcPolicy.MaxContainers / evictUnits.NumEvictUnits()
   253  		if numContainersPerEvictUnit < 1 {
   254  			numContainersPerEvictUnit = 1
   255  		}
   256  		cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, numContainersPerEvictUnit)
   257  
   258  		// If we still need to evict, evict oldest first.
   259  		numContainers := evictUnits.NumContainers()
   260  		if numContainers > gcPolicy.MaxContainers {
   261  			flattened := make([]containerGCInfo, 0, numContainers)
   262  			for key := range evictUnits {
   263  				flattened = append(flattened, evictUnits[key]...)
   264  			}
   265  			sort.Sort(byCreated(flattened))
   266  
   267  			cgc.removeOldestN(ctx, flattened, numContainers-gcPolicy.MaxContainers)
   268  		}
   269  	}
   270  	return nil
   271  }
   272  
   273  // evictSandboxes remove all evictable sandboxes. An evictable sandbox must
   274  // meet the following requirements:
   275  //  1. not in ready state
   276  //  2. contains no containers.
   277  //  3. belong to a non-existent (i.e., already removed) pod, or is not the
   278  //     most recently created sandbox for the pod.
   279  func (cgc *containerGC) evictSandboxes(ctx context.Context, evictNonDeletedPods bool) error {
   280  	containers, err := cgc.manager.getKubeletContainers(ctx, true)
   281  	if err != nil {
   282  		return err
   283  	}
   284  
   285  	sandboxes, err := cgc.manager.getKubeletSandboxes(ctx, true)
   286  	if err != nil {
   287  		return err
   288  	}
   289  
   290  	// collect all the PodSandboxId of container
   291  	sandboxIDs := sets.NewString()
   292  	for _, container := range containers {
   293  		sandboxIDs.Insert(container.PodSandboxId)
   294  	}
   295  
   296  	sandboxesByPod := make(sandboxesByPodUID, len(sandboxes))
   297  	for _, sandbox := range sandboxes {
   298  		podUID := types.UID(sandbox.Metadata.Uid)
   299  		sandboxInfo := sandboxGCInfo{
   300  			id:         sandbox.Id,
   301  			createTime: time.Unix(0, sandbox.CreatedAt),
   302  		}
   303  
   304  		// Set ready sandboxes and sandboxes that still have containers to be active.
   305  		if sandbox.State == runtimeapi.PodSandboxState_SANDBOX_READY || sandboxIDs.Has(sandbox.Id) {
   306  			sandboxInfo.active = true
   307  		}
   308  
   309  		sandboxesByPod[podUID] = append(sandboxesByPod[podUID], sandboxInfo)
   310  	}
   311  
   312  	for podUID, sandboxes := range sandboxesByPod {
   313  		if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) {
   314  			// Remove all evictable sandboxes if the pod has been removed.
   315  			// Note that the latest dead sandbox is also removed if there is
   316  			// already an active one.
   317  			cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes))
   318  		} else {
   319  			// Keep latest one if the pod still exists.
   320  			cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes)-1)
   321  		}
   322  	}
   323  	return nil
   324  }
   325  
   326  // evictPodLogsDirectories evicts all evictable pod logs directories. Pod logs directories
   327  // are evictable if there are no corresponding pods.
   328  func (cgc *containerGC) evictPodLogsDirectories(ctx context.Context, allSourcesReady bool) error {
   329  	osInterface := cgc.manager.osInterface
   330  	if allSourcesReady {
   331  		// Only remove pod logs directories when all sources are ready.
   332  		dirs, err := osInterface.ReadDir(podLogsRootDirectory)
   333  		if err != nil {
   334  			return fmt.Errorf("failed to read podLogsRootDirectory %q: %v", podLogsRootDirectory, err)
   335  		}
   336  		for _, dir := range dirs {
   337  			name := dir.Name()
   338  			podUID := parsePodUIDFromLogsDirectory(name)
   339  			if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) {
   340  				continue
   341  			}
   342  			klog.V(4).InfoS("Removing pod logs", "podUID", podUID)
   343  			err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name))
   344  			if err != nil {
   345  				klog.ErrorS(err, "Failed to remove pod logs directory", "path", name)
   346  			}
   347  		}
   348  	}
   349  
   350  	// Remove dead container log symlinks.
   351  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
   352  	logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix)))
   353  	for _, logSymlink := range logSymlinks {
   354  		if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) {
   355  			if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil {
   356  				resp, err := cgc.manager.runtimeService.ContainerStatus(ctx, containerID, false)
   357  				if err != nil {
   358  					// TODO: we should handle container not found (i.e. container was deleted) case differently
   359  					// once https://github.com/kubernetes/kubernetes/issues/63336 is resolved
   360  					klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", err)
   361  				} else {
   362  					status := resp.GetStatus()
   363  					if status == nil {
   364  						klog.V(4).InfoS("Container status is nil")
   365  						continue
   366  					}
   367  					if status.State != runtimeapi.ContainerState_CONTAINER_EXITED {
   368  						// Here is how container log rotation works (see containerLogManager#rotateLatestLog):
   369  						//
   370  						// 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp))
   371  						// 2. reopen the container log
   372  						// 3. if #2 fails, rename rotated log file back to container log
   373  						//
   374  						// There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3).
   375  						// Hence the symlink may be deemed unhealthy during that period.
   376  						// See https://github.com/kubernetes/kubernetes/issues/52172
   377  						//
   378  						// We only remove unhealthy symlink for dead containers
   379  						klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink)
   380  						continue
   381  					}
   382  				}
   383  			} else {
   384  				klog.V(4).InfoS("Unable to obtain container ID", "err", err)
   385  			}
   386  			err := osInterface.Remove(logSymlink)
   387  			if err != nil {
   388  				klog.ErrorS(err, "Failed to remove container log dead symlink", "path", logSymlink)
   389  			} else {
   390  				klog.V(4).InfoS("Removed symlink", "path", logSymlink)
   391  			}
   392  		}
   393  	}
   394  	return nil
   395  }
   396  
   397  // GarbageCollect removes dead containers using the specified container gc policy.
   398  // Note that gc policy is not applied to sandboxes. Sandboxes are only removed when they are
   399  // not ready and containing no containers.
   400  //
   401  // GarbageCollect consists of the following steps:
   402  // * gets evictable containers which are not active and created more than gcPolicy.MinAge ago.
   403  // * removes oldest dead containers for each pod by enforcing gcPolicy.MaxPerPodContainer.
   404  // * removes oldest dead containers by enforcing gcPolicy.MaxContainers.
   405  // * gets evictable sandboxes which are not ready and contains no containers.
   406  // * removes evictable sandboxes.
   407  func (cgc *containerGC) GarbageCollect(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
   408  	ctx, otelSpan := cgc.tracer.Start(ctx, "Containers/GarbageCollect")
   409  	defer otelSpan.End()
   410  	errors := []error{}
   411  	// Remove evictable containers
   412  	if err := cgc.evictContainers(ctx, gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
   413  		errors = append(errors, err)
   414  	}
   415  
   416  	// Remove sandboxes with zero containers
   417  	if err := cgc.evictSandboxes(ctx, evictNonDeletedPods); err != nil {
   418  		errors = append(errors, err)
   419  	}
   420  
   421  	// Remove pod sandbox log directory
   422  	if err := cgc.evictPodLogsDirectories(ctx, allSourcesReady); err != nil {
   423  		errors = append(errors, err)
   424  	}
   425  	return utilerrors.NewAggregate(errors)
   426  }