k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kuberuntime/kuberuntime_container.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kuberuntime
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math/rand"
    25  	"net/url"
    26  	"os"
    27  	"path/filepath"
    28  	"regexp"
    29  	goruntime "runtime"
    30  	"sort"
    31  	"strconv"
    32  	"strings"
    33  	"sync"
    34  	"time"
    35  
    36  	crierror "k8s.io/cri-api/pkg/errors"
    37  
    38  	"github.com/opencontainers/selinux/go-selinux"
    39  	grpcstatus "google.golang.org/grpc/status"
    40  
    41  	"github.com/armon/circbuf"
    42  	"k8s.io/klog/v2"
    43  
    44  	v1 "k8s.io/api/core/v1"
    45  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    46  	kubetypes "k8s.io/apimachinery/pkg/types"
    47  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    48  	"k8s.io/apimachinery/pkg/util/sets"
    49  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    50  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    51  	remote "k8s.io/cri-client/pkg"
    52  	kubelettypes "k8s.io/kubelet/pkg/types"
    53  	"k8s.io/kubernetes/pkg/features"
    54  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    55  	"k8s.io/kubernetes/pkg/kubelet/events"
    56  	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
    57  	"k8s.io/kubernetes/pkg/kubelet/types"
    58  	"k8s.io/kubernetes/pkg/kubelet/util/format"
    59  	"k8s.io/kubernetes/pkg/util/tail"
    60  	volumeutil "k8s.io/kubernetes/pkg/volume/util"
    61  )
    62  
    63  var (
    64  	// ErrCreateContainerConfig - failed to create container config
    65  	ErrCreateContainerConfig = errors.New("CreateContainerConfigError")
    66  	// ErrPreCreateHook - failed to execute PreCreateHook
    67  	ErrPreCreateHook = errors.New("PreCreateHookError")
    68  	// ErrCreateContainer - failed to create container
    69  	ErrCreateContainer = errors.New("CreateContainerError")
    70  	// ErrPreStartHook - failed to execute PreStartHook
    71  	ErrPreStartHook = errors.New("PreStartHookError")
    72  	// ErrPostStartHook - failed to execute PostStartHook
    73  	ErrPostStartHook = errors.New("PostStartHookError")
    74  )
    75  
    76  // recordContainerEvent should be used by the runtime manager for all container related events.
    77  // it has sanity checks to ensure that we do not write events that could overload the control plane.
    78  // in particular, it ensures that a containerID never appears in an event message, as that
    79  // is prone to causing a lot of distinct events that do not aggregate well.
    80  // it replaces any reference to a containerID with the containerName, which is stable and is what users know.
    81  func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
    82  	ref, err := kubecontainer.GenerateContainerRef(pod, container)
    83  	if err != nil {
    84  		klog.ErrorS(err, "Can't make a container ref", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
    85  		return
    86  	}
    87  	eventMessage := message
    88  	if len(args) > 0 {
    89  		eventMessage = fmt.Sprintf(message, args...)
    90  	}
    91  	// this is a hack, but often the error from the runtime includes the containerID
    92  	// which kills our ability to deduplicate events.  this protection makes a huge
    93  	// difference in the number of unique events
    94  	if containerID != "" {
    95  		eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
    96  	}
    97  	m.recorder.Event(ref, eventType, reason, eventMessage)
    98  }
    99  
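// Illustrative sketch, not part of the upstream file (the message and ID below
// are made up): the scrubbing performed above simply replaces the container ID
// with the stable container name, so event messages stay identical across
// container instances and aggregate into a single event.
func exampleScrubContainerID() string {
	msg := "Error: container 0123456789abcdef failed"
	return strings.Replace(msg, "0123456789abcdef", "my-container", -1)
}
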
   100  // startSpec wraps the spec required to start a container, either a regular/init container
   101  // or an ephemeral container. Ephemeral containers contain all the fields of regular/init
   102  // containers, plus some additional fields. In both cases startSpec.container will be set.
   103  type startSpec struct {
   104  	container          *v1.Container
   105  	ephemeralContainer *v1.EphemeralContainer
   106  }
   107  
   108  func containerStartSpec(c *v1.Container) *startSpec {
   109  	return &startSpec{container: c}
   110  }
   111  
   112  func ephemeralContainerStartSpec(ec *v1.EphemeralContainer) *startSpec {
   113  	return &startSpec{
   114  		container:          (*v1.Container)(&ec.EphemeralContainerCommon),
   115  		ephemeralContainer: ec,
   116  	}
   117  }
   118  
   119  // getTargetID returns the kubecontainer.ContainerID for ephemeral container namespace
   120  // targeting. The target is stored as EphemeralContainer.TargetContainerName, which must be
   121  // resolved to a ContainerID using podStatus. The target container must already exist, which
   122  // usually isn't a problem since ephemeral containers aren't allowed at pod creation time.
   123  func (s *startSpec) getTargetID(podStatus *kubecontainer.PodStatus) (*kubecontainer.ContainerID, error) {
   124  	if s.ephemeralContainer == nil || s.ephemeralContainer.TargetContainerName == "" {
   125  		return nil, nil
   126  	}
   127  
   128  	targetStatus := podStatus.FindContainerStatusByName(s.ephemeralContainer.TargetContainerName)
   129  	if targetStatus == nil {
   130  		return nil, fmt.Errorf("unable to find target container %v", s.ephemeralContainer.TargetContainerName)
   131  	}
   132  
   133  	return &targetStatus.ID, nil
   134  }
   135  
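// A short sketch, not part of the upstream file (names and IDs are made up),
// showing how an ephemeral container's namespace target is resolved: the
// TargetContainerName is looked up in the pod status and the existing
// container's ID becomes the target passed down to the runtime.
func exampleEphemeralTargetLookup() {
	ec := &v1.EphemeralContainer{
		EphemeralContainerCommon: v1.EphemeralContainerCommon{Name: "debugger"},
		TargetContainerName:      "app",
	}
	spec := ephemeralContainerStartSpec(ec)
	podStatus := &kubecontainer.PodStatus{
		ContainerStatuses: []*kubecontainer.Status{
			{Name: "app", ID: kubecontainer.ContainerID{Type: "containerd", ID: "abc123"}},
		},
	}
	if target, err := spec.getTargetID(podStatus); err == nil && target != nil {
		fmt.Println("namespace target:", target.String())
	}
}
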
   136  func calcRestartCountByLogDir(path string) (int, error) {
   137  	// if the path doesn't exist then it's not an error
   138  	if _, err := os.Stat(path); err != nil {
   139  		return 0, nil
   140  	}
   141  	files, err := os.ReadDir(path)
   142  	if err != nil {
   143  		return 0, err
   144  	}
   145  	if len(files) == 0 {
   146  		return 0, nil
   147  	}
   148  	restartCount := 0
   149  	restartCountLogFileRegex := regexp.MustCompile(`^(\d+)\.log(\..*)?`)
   150  	for _, file := range files {
   151  		if file.IsDir() {
   152  			continue
   153  		}
   154  		matches := restartCountLogFileRegex.FindStringSubmatch(file.Name())
   155  		if len(matches) == 0 {
   156  			continue
   157  		}
   158  		count, err := strconv.Atoi(matches[1])
   159  		if err != nil {
   160  			// it is unlikely the kubelet created this file;
   161  			// more likely it is a custom file whose name merely looks numeric
   162  			continue
   163  		}
   164  		count++
   165  		if count > restartCount {
   166  			restartCount = count
   167  		}
   168  	}
   169  	return restartCount, nil
   170  }
   171  
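// The sketch below is not part of the upstream file (the log directory path is
// hypothetical); it illustrates the naming convention calcRestartCountByLogDir
// relies on: container logs are written as "{restartCount}.log" (possibly with
// rotation suffixes), so the next restart count is the highest numbered log
// file plus one - e.g. a directory holding "0.log" and "1.log" yields 2.
func exampleNextRestartCount() int {
	restartCount, err := calcRestartCountByLogDir("/var/log/pods/default_mypod_0000/app")
	if err != nil {
		return 0
	}
	return restartCount
}
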
   172  // startContainer starts a container and, on error, returns a message indicating why it failed.
   173  // It starts the container through the following steps:
   174  // * pull the image
   175  // * create the container
   176  // * start the container
   177  // * run the post start lifecycle hooks (if applicable)
   178  func (m *kubeGenericRuntimeManager) startContainer(ctx context.Context, podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
   179  	container := spec.container
   180  
   181  	// Step 1: pull the image.
   182  
   183  	// If the RuntimeClassInImageCriAPI feature gate is enabled, pass the runtime
   184  	// handler for the specified runtime class. If no runtime class is specified,
   185  	// pass "".
   186  	podRuntimeHandler := ""
   187  	var err error
   188  	if utilfeature.DefaultFeatureGate.Enabled(features.RuntimeClassInImageCriAPI) {
   189  		if pod.Spec.RuntimeClassName != nil && *pod.Spec.RuntimeClassName != "" {
   190  			podRuntimeHandler, err = m.runtimeClassManager.LookupRuntimeHandler(pod.Spec.RuntimeClassName)
   191  			if err != nil {
   192  				msg := fmt.Sprintf("Failed to lookup runtimeHandler for runtimeClassName %v", pod.Spec.RuntimeClassName)
   193  				return msg, err
   194  			}
   195  		}
   196  	}
   197  
   198  	imageRef, msg, err := m.imagePuller.EnsureImageExists(ctx, pod, container, pullSecrets, podSandboxConfig, podRuntimeHandler)
   199  	if err != nil {
   200  		s, _ := grpcstatus.FromError(err)
   201  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   202  		return msg, err
   203  	}
   204  
   205  	// Step 2: create the container.
   206  	// For a new container, the RestartCount should be 0
   207  	restartCount := 0
   208  	containerStatus := podStatus.FindContainerStatusByName(container.Name)
   209  	if containerStatus != nil {
   210  		restartCount = containerStatus.RestartCount + 1
   211  	} else {
   212  		// The container runtime keeps state on container statuses and
   213  		// what the container restart count is. When nodes are rebooted
   214  		// some container runtimes clear their state which causes the
   215  		// restartCount to be reset to 0. This causes the logfile to
   216  		// start at 0.log, which either overwrites or appends to the
   217  		// already existing log.
   218  		//
   219  		// We are checking to see if the log directory exists, and find
   220  		// the latest restartCount by checking the log name -
   221  		// {restartCount}.log - and adding 1 to it.
   222  		logDir := BuildContainerLogsDirectory(m.podLogsDirectory, pod.Namespace, pod.Name, pod.UID, container.Name)
   223  		restartCount, err = calcRestartCountByLogDir(logDir)
   224  		if err != nil {
   225  			klog.InfoS("Cannot calculate restartCount from the log directory", "logDir", logDir, "err", err)
   226  			restartCount = 0
   227  		}
   228  	}
   229  
   230  	target, err := spec.getTargetID(podStatus)
   231  	if err != nil {
   232  		s, _ := grpcstatus.FromError(err)
   233  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   234  		return s.Message(), ErrCreateContainerConfig
   235  	}
   236  
   237  	containerConfig, cleanupAction, err := m.generateContainerConfig(ctx, container, pod, restartCount, podIP, imageRef, podIPs, target)
   238  	if cleanupAction != nil {
   239  		defer cleanupAction()
   240  	}
   241  	if err != nil {
   242  		s, _ := grpcstatus.FromError(err)
   243  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   244  		return s.Message(), ErrCreateContainerConfig
   245  	}
   246  
   247  	err = m.internalLifecycle.PreCreateContainer(pod, container, containerConfig)
   248  	if err != nil {
   249  		s, _ := grpcstatus.FromError(err)
   250  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Internal PreCreateContainer hook failed: %v", s.Message())
   251  		return s.Message(), ErrPreCreateHook
   252  	}
   253  
   254  	containerID, err := m.runtimeService.CreateContainer(ctx, podSandboxID, containerConfig, podSandboxConfig)
   255  	if err != nil {
   256  		s, _ := grpcstatus.FromError(err)
   257  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   258  		return s.Message(), ErrCreateContainer
   259  	}
   260  	err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
   261  	if err != nil {
   262  		s, _ := grpcstatus.FromError(err)
   263  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", s.Message())
   264  		return s.Message(), ErrPreStartHook
   265  	}
   266  	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, fmt.Sprintf("Created container %s", container.Name))
   267  
   268  	// Step 3: start the container.
   269  	err = m.runtimeService.StartContainer(ctx, containerID)
   270  	if err != nil {
   271  		s, _ := grpcstatus.FromError(err)
   272  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
   273  		return s.Message(), kubecontainer.ErrRunContainer
   274  	}
   275  	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, fmt.Sprintf("Started container %s", container.Name))
   276  
   277  	// Symlink container logs to the legacy container log location for cluster logging
   278  	// support.
   279  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
   280  	containerMeta := containerConfig.GetMetadata()
   281  	sandboxMeta := podSandboxConfig.GetMetadata()
   282  	legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name,
   283  		sandboxMeta.Namespace)
   284  	containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
   285  	// only create the legacy symlink if the containerLog path exists (or the error is not IsNotExist).
   286  	// If the containerLog path does not exist, the legacySymlink would only dangle; dangling
   287  	// symlinks are later removed by container GC, so it does not make sense to create them
   288  	// in the first place. This happens when the journald logging driver is used with docker.
   289  	if _, err := m.osInterface.Stat(containerLog); !os.IsNotExist(err) {
   290  		if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
   291  			klog.ErrorS(err, "Failed to create legacy symbolic link", "path", legacySymlink,
   292  				"containerID", containerID, "containerLogPath", containerLog)
   293  		}
   294  	}
   295  
   296  	// Step 4: execute the post start hook.
   297  	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
   298  		kubeContainerID := kubecontainer.ContainerID{
   299  			Type: m.runtimeName,
   300  			ID:   containerID,
   301  		}
   302  		msg, handlerErr := m.runner.Run(ctx, kubeContainerID, pod, container, container.Lifecycle.PostStart)
   303  		if handlerErr != nil {
   304  			klog.ErrorS(handlerErr, "Failed to execute PostStartHook", "pod", klog.KObj(pod),
   305  				"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
   306  			// do not record the message in the event so that secrets won't leak from the server.
   307  			m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, "PostStartHook failed")
   308  			if err := m.killContainer(ctx, pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil, nil); err != nil {
   309  				klog.ErrorS(err, "Failed to kill container", "pod", klog.KObj(pod),
   310  					"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
   311  			}
   312  			return msg, ErrPostStartHook
   313  		}
   314  	}
   315  
   316  	return "", nil
   317  }
   318  
   319  // generateContainerConfig generates container config for kubelet runtime v1.
   320  func (m *kubeGenericRuntimeManager) generateContainerConfig(ctx context.Context, container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
   321  	opts, cleanupAction, err := m.runtimeHelper.GenerateRunContainerOptions(ctx, pod, container, podIP, podIPs)
   322  	if err != nil {
   323  		return nil, nil, err
   324  	}
   325  
   326  	uid, username, err := m.getImageUser(ctx, container.Image)
   327  	if err != nil {
   328  		return nil, cleanupAction, err
   329  	}
   330  
   331  	// Verify RunAsNonRoot. Non-root verification only supports numeric user.
   332  	if err := verifyRunAsNonRoot(pod, container, uid, username); err != nil {
   333  		return nil, cleanupAction, err
   334  	}
   335  
   336  	command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)
   337  	logDir := BuildContainerLogsDirectory(m.podLogsDirectory, pod.Namespace, pod.Name, pod.UID, container.Name)
   338  	err = m.osInterface.MkdirAll(logDir, 0755)
   339  	if err != nil {
   340  		return nil, cleanupAction, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)
   341  	}
   342  	containerLogsPath := buildContainerLogsPath(container.Name, restartCount)
   343  	restartCountUint32 := uint32(restartCount)
   344  	config := &runtimeapi.ContainerConfig{
   345  		Metadata: &runtimeapi.ContainerMetadata{
   346  			Name:    container.Name,
   347  			Attempt: restartCountUint32,
   348  		},
   349  		Image:       &runtimeapi.ImageSpec{Image: imageRef, UserSpecifiedImage: container.Image},
   350  		Command:     command,
   351  		Args:        args,
   352  		WorkingDir:  container.WorkingDir,
   353  		Labels:      newContainerLabels(container, pod),
   354  		Annotations: newContainerAnnotations(container, pod, restartCount, opts),
   355  		Devices:     makeDevices(opts),
   356  		CDIDevices:  makeCDIDevices(opts),
   357  		Mounts:      m.makeMounts(opts, container),
   358  		LogPath:     containerLogsPath,
   359  		Stdin:       container.Stdin,
   360  		StdinOnce:   container.StdinOnce,
   361  		Tty:         container.TTY,
   362  	}
   363  
   364  	// set platform specific configurations.
   365  	if err := m.applyPlatformSpecificContainerConfig(config, container, pod, uid, username, nsTarget); err != nil {
   366  		return nil, cleanupAction, err
   367  	}
   368  
   369  	// set environment variables
   370  	envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
   371  	for idx := range opts.Envs {
   372  		e := opts.Envs[idx]
   373  		envs[idx] = &runtimeapi.KeyValue{
   374  			Key:   e.Name,
   375  			Value: e.Value,
   376  		}
   377  	}
   378  	config.Envs = envs
   379  
   380  	return config, cleanupAction, nil
   381  }
   382  
   383  func (m *kubeGenericRuntimeManager) updateContainerResources(pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error {
   384  	containerResources := m.generateContainerResources(pod, container)
   385  	if containerResources == nil {
   386  		return fmt.Errorf("container %q updateContainerResources failed: cannot generate resources config", containerID.String())
   387  	}
   388  	ctx := context.Background()
   389  	err := m.runtimeService.UpdateContainerResources(ctx, containerID.ID, containerResources)
   390  	if err != nil {
   391  		klog.ErrorS(err, "UpdateContainerResources failed", "container", containerID.String())
   392  	}
   393  	return err
   394  }
   395  
   396  // makeDevices generates container devices for kubelet runtime v1.
   397  func makeDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.Device {
   398  	devices := make([]*runtimeapi.Device, len(opts.Devices))
   399  
   400  	for idx := range opts.Devices {
   401  		device := opts.Devices[idx]
   402  		devices[idx] = &runtimeapi.Device{
   403  			HostPath:      device.PathOnHost,
   404  			ContainerPath: device.PathInContainer,
   405  			Permissions:   device.Permissions,
   406  		}
   407  	}
   408  
   409  	return devices
   410  }
   411  
   412  // makeCDIDevices generates container CDIDevices for kubelet runtime v1.
   413  func makeCDIDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.CDIDevice {
   414  	devices := make([]*runtimeapi.CDIDevice, len(opts.CDIDevices))
   415  
   416  	for i, device := range opts.CDIDevices {
   417  		devices[i] = &runtimeapi.CDIDevice{
   418  			Name: device.Name,
   419  		}
   420  	}
   421  
   422  	return devices
   423  }
   424  
   425  // makeMounts generates container volume mounts for kubelet runtime v1.
   426  func (m *kubeGenericRuntimeManager) makeMounts(opts *kubecontainer.RunContainerOptions, container *v1.Container) []*runtimeapi.Mount {
   427  	volumeMounts := []*runtimeapi.Mount{}
   428  
   429  	for idx := range opts.Mounts {
   430  		v := opts.Mounts[idx]
   431  		selinuxRelabel := v.SELinuxRelabel && selinux.GetEnabled()
   432  		mount := &runtimeapi.Mount{
   433  			HostPath:          v.HostPath,
   434  			ContainerPath:     v.ContainerPath,
   435  			Readonly:          v.ReadOnly,
   436  			SelinuxRelabel:    selinuxRelabel,
   437  			Propagation:       v.Propagation,
   438  			RecursiveReadOnly: v.RecursiveReadOnly,
   439  		}
   440  
   441  		volumeMounts = append(volumeMounts, mount)
   442  	}
   443  
   444  	// The reason we create and mount the termination-message file here (not elsewhere in
   445  	// the kubelet) is that the file's location depends on the container instance, and we
   446  	// need to create and mount the file before actually starting the container.
   447  	if opts.PodContainerDir != "" && len(container.TerminationMessagePath) != 0 {
   448  		// Because the PodContainerDir contains pod uid and container name which is unique enough,
   449  		// here we just add a random id to make the path unique for different instances
   450  		// of the same container.
   451  		cid := makeUID()
   452  		containerLogPath := filepath.Join(opts.PodContainerDir, cid)
   453  		fs, err := m.osInterface.Create(containerLogPath)
   454  		if err != nil {
   455  			utilruntime.HandleError(fmt.Errorf("error on creating termination-log file %q: %v", containerLogPath, err))
   456  		} else {
   457  			fs.Close()
   458  
   459  			// Chmod is needed because os.Create() ends up calling
   460  			// open(2) to create the file, so the final mode used is "mode &
   461  			// ~umask". But we want to make sure the specified mode is used
   462  			// in the file no matter what the umask is.
   463  			if err := m.osInterface.Chmod(containerLogPath, 0666); err != nil {
   464  				utilruntime.HandleError(fmt.Errorf("unable to set termination-log file permissions %q: %v", containerLogPath, err))
   465  			}
   466  
   467  			// Volume Mounts fail on Windows if it is not of the form C:/
   468  			containerLogPath = volumeutil.MakeAbsolutePath(goruntime.GOOS, containerLogPath)
   469  			terminationMessagePath := volumeutil.MakeAbsolutePath(goruntime.GOOS, container.TerminationMessagePath)
   470  			selinuxRelabel := selinux.GetEnabled()
   471  			volumeMounts = append(volumeMounts, &runtimeapi.Mount{
   472  				HostPath:       containerLogPath,
   473  				ContainerPath:  terminationMessagePath,
   474  				SelinuxRelabel: selinuxRelabel,
   475  			})
   476  		}
   477  	}
   478  
   479  	return volumeMounts
   480  }
   481  
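// Sketch, not part of the upstream file (both paths are hypothetical): the
// termination-message handling above creates a per-instance file under the
// pod's container directory on the host and bind-mounts it at the container's
// terminationMessagePath, which is how the runtime later finds the message.
func exampleTerminationMessageMount(podContainerDir string) *runtimeapi.Mount {
	return &runtimeapi.Mount{
		HostPath:      filepath.Join(podContainerDir, makeUID()),
		ContainerPath: "/dev/termination-log",
	}
}
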
   482  // getKubeletContainers lists containers managed by kubelet.
   483  // The boolean parameter specifies whether to return all containers, including
   484  // those that have already exited or are dead (used for garbage collection).
   485  func (m *kubeGenericRuntimeManager) getKubeletContainers(ctx context.Context, allContainers bool) ([]*runtimeapi.Container, error) {
   486  	filter := &runtimeapi.ContainerFilter{}
   487  	if !allContainers {
   488  		filter.State = &runtimeapi.ContainerStateValue{
   489  			State: runtimeapi.ContainerState_CONTAINER_RUNNING,
   490  		}
   491  	}
   492  
   493  	containers, err := m.runtimeService.ListContainers(ctx, filter)
   494  	if err != nil {
   495  		klog.ErrorS(err, "ListContainers failed")
   496  		return nil, err
   497  	}
   498  
   499  	return containers, nil
   500  }
   501  
   502  // makeUID returns a randomly generated string.
   503  func makeUID() string {
   504  	return fmt.Sprintf("%08x", rand.Uint32())
   505  }
   506  
   507  // getTerminationMessage looks on the filesystem for the provided termination message path and returns a limited
   508  // amount of its bytes, or true if the logs should be checked instead.
   509  func getTerminationMessage(status *runtimeapi.ContainerStatus, terminationMessagePath string, fallbackToLogs bool) (string, bool) {
   510  	if len(terminationMessagePath) == 0 {
   511  		return "", fallbackToLogs
   512  	}
   513  	// Volume Mounts fail on Windows if it is not of the form C:/
   514  	terminationMessagePath = volumeutil.MakeAbsolutePath(goruntime.GOOS, terminationMessagePath)
   515  	for _, mount := range status.Mounts {
   516  		if mount.ContainerPath != terminationMessagePath {
   517  			continue
   518  		}
   519  		path := mount.HostPath
   520  		data, _, err := tail.ReadAtMost(path, kubecontainer.MaxContainerTerminationMessageLength)
   521  		if err != nil {
   522  			if os.IsNotExist(err) {
   523  				return "", fallbackToLogs
   524  			}
   525  			return fmt.Sprintf("Error on reading termination log %s: %v", path, err), false
   526  		}
   527  		return string(data), (fallbackToLogs && len(data) == 0)
   528  	}
   529  	return "", fallbackToLogs
   530  }
   531  
   532  // readLastStringFromContainerLogs attempts to read up to the maximum termination-message log length from the end
   533  // of the CRI log at path, limited to the maximum number of termination-message log lines.
   534  func (m *kubeGenericRuntimeManager) readLastStringFromContainerLogs(path string) string {
   535  	value := int64(kubecontainer.MaxContainerTerminationMessageLogLines)
   536  	buf, _ := circbuf.NewBuffer(kubecontainer.MaxContainerTerminationMessageLogLength)
   537  	if err := m.ReadLogs(context.Background(), path, "", &v1.PodLogOptions{TailLines: &value}, buf, buf); err != nil {
   538  		return fmt.Sprintf("Error on reading termination message from logs: %v", err)
   539  	}
   540  	return buf.String()
   541  }
   542  
   543  func (m *kubeGenericRuntimeManager) convertToKubeContainerStatus(status *runtimeapi.ContainerStatus) (cStatus *kubecontainer.Status) {
   544  	cStatus = toKubeContainerStatus(status, m.runtimeName)
   545  	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
   546  		// Populate the termination message if needed.
   547  		annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
   548  		// If a container cannot even be started, it certainly does not have logs, so no need to fallbackToLogs.
   549  		fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError &&
   550  			cStatus.ExitCode != 0 && cStatus.Reason != "ContainerCannotRun"
   551  		tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
   552  		if checkLogs {
   553  			tMessage = m.readLastStringFromContainerLogs(status.GetLogPath())
   554  		}
   555  		// Enrich the status message if the termination message written by the application is not empty
   556  		if len(tMessage) != 0 {
   557  			if len(cStatus.Message) != 0 {
   558  				cStatus.Message += ": "
   559  			}
   560  			cStatus.Message += tMessage
   561  		}
   562  	}
   563  	return cStatus
   564  }
   565  
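// Sketch, not part of the upstream file, of the fallback decision applied in
// convertToKubeContainerStatus: logs are only consulted when the policy is
// FallbackToLogsOnError, the container actually failed, and the failure was not
// "ContainerCannotRun" (a container that never started has no logs to read).
func exampleShouldFallBackToLogs(policy v1.TerminationMessagePolicy, exitCode int, reason string) bool {
	return policy == v1.TerminationMessageFallbackToLogsOnError &&
		exitCode != 0 && reason != "ContainerCannotRun"
}
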
   566  // getPodContainerStatuses gets all containers' statuses for the pod.
   567  func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context, uid kubetypes.UID, name, namespace string) ([]*kubecontainer.Status, error) {
   568  	// Select all containers of the given pod.
   569  	containers, err := m.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{
   570  		LabelSelector: map[string]string{kubelettypes.KubernetesPodUIDLabel: string(uid)},
   571  	})
   572  	if err != nil {
   573  		klog.ErrorS(err, "ListContainers error")
   574  		return nil, err
   575  	}
   576  
   577  	statuses := []*kubecontainer.Status{}
   578  	// TODO: optimization: set maximum number of containers per container name to examine.
   579  	for _, c := range containers {
   580  		resp, err := m.runtimeService.ContainerStatus(ctx, c.Id, false)
   581  		// Between List (ListContainers) and check (ContainerStatus) another thread might remove a container, and that is normal.
   582  		// The previous call (ListContainers) never fails due to a pod container not existing.
   583  		// Therefore, this method should not either, but instead act as if the previous call failed,
   584  		// which means the error should be ignored.
   585  		if crierror.IsNotFound(err) {
   586  			continue
   587  		}
   588  		if err != nil {
   589  			// Merely log this here; GetPodStatus will actually report the error out.
   590  			klog.V(4).InfoS("ContainerStatus return error", "containerID", c.Id, "err", err)
   591  			return nil, err
   592  		}
   593  		status := resp.GetStatus()
   594  		if status == nil {
   595  			return nil, remote.ErrContainerStatusNil
   596  		}
   597  		cStatus := m.convertToKubeContainerStatus(status)
   598  		statuses = append(statuses, cStatus)
   599  	}
   600  
   601  	sort.Sort(containerStatusByCreated(statuses))
   602  	return statuses, nil
   603  }
   604  
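// Sketch, not part of the upstream file: every kubelet-managed container
// carries the pod UID as a CRI label, so the filter used above returns exactly
// the containers that belong to a single pod.
func examplePodUIDFilter(uid kubetypes.UID) *runtimeapi.ContainerFilter {
	return &runtimeapi.ContainerFilter{
		LabelSelector: map[string]string{kubelettypes.KubernetesPodUIDLabel: string(uid)},
	}
}
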
   605  func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName string) *kubecontainer.Status {
   606  	annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
   607  	labeledInfo := getContainerInfoFromLabels(status.Labels)
   608  	var cStatusResources *kubecontainer.ContainerResources
   609  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
   610  		// If runtime reports cpu & memory resources info, add it to container status
   611  		cStatusResources = toKubeContainerResources(status.Resources)
   612  	}
   613  
   614  	// Keep backwards compatibility with older runtimes; status.ImageId was added in v1.30
   615  	imageID := status.ImageRef
   616  	if status.ImageId != "" {
   617  		imageID = status.ImageId
   618  	}
   619  
   620  	cStatus := &kubecontainer.Status{
   621  		ID: kubecontainer.ContainerID{
   622  			Type: runtimeName,
   623  			ID:   status.Id,
   624  		},
   625  		Name:                 labeledInfo.ContainerName,
   626  		Image:                status.Image.Image,
   627  		ImageID:              imageID,
   628  		ImageRef:             status.ImageRef,
   629  		ImageRuntimeHandler:  status.Image.RuntimeHandler,
   630  		Hash:                 annotatedInfo.Hash,
   631  		HashWithoutResources: annotatedInfo.HashWithoutResources,
   632  		RestartCount:         annotatedInfo.RestartCount,
   633  		State:                toKubeContainerState(status.State),
   634  		CreatedAt:            time.Unix(0, status.CreatedAt),
   635  		Resources:            cStatusResources,
   636  	}
   637  
   638  	if status.State != runtimeapi.ContainerState_CONTAINER_CREATED {
   639  		// If container is not in the created state, we have tried and
   640  		// started the container. Set the StartedAt time.
   641  		cStatus.StartedAt = time.Unix(0, status.StartedAt)
   642  	}
   643  	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
   644  		cStatus.Reason = status.Reason
   645  		cStatus.Message = status.Message
   646  		cStatus.ExitCode = int(status.ExitCode)
   647  		cStatus.FinishedAt = time.Unix(0, status.FinishedAt)
   648  	}
   649  	return cStatus
   650  }
   651  
   652  // executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
   653  func (m *kubeGenericRuntimeManager) executePreStopHook(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
   654  	klog.V(3).InfoS("Running preStop hook", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerSpec.Name, "containerID", containerID.String())
   655  
   656  	start := metav1.Now()
   657  	done := make(chan struct{})
   658  	go func() {
   659  		defer close(done)
   660  		defer utilruntime.HandleCrash()
   661  		if _, err := m.runner.Run(ctx, containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
   662  			klog.ErrorS(err, "PreStop hook failed", "pod", klog.KObj(pod), "podUID", pod.UID,
   663  				"containerName", containerSpec.Name, "containerID", containerID.String())
   664  			// do not record the message in the event so that secrets won't leak from the server.
   665  			m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, "PreStopHook failed")
   666  		}
   667  	}()
   668  
   669  	select {
   670  	case <-time.After(time.Duration(gracePeriod) * time.Second):
   671  		klog.V(2).InfoS("PreStop hook not completed in grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
   672  			"containerName", containerSpec.Name, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   673  	case <-done:
   674  		klog.V(3).InfoS("PreStop hook completed", "pod", klog.KObj(pod), "podUID", pod.UID,
   675  			"containerName", containerSpec.Name, "containerID", containerID.String())
   676  	}
   677  
   678  	return int64(metav1.Now().Sub(start.Time).Seconds())
   679  }
   680  
   681  // restoreSpecsFromContainerLabels restores all information needed for killing a container. In some
   682  // cases we may not have the pod and container spec when killing a container, e.g. the pod was deleted
   683  // during a kubelet restart.
   684  // To solve this problem, we've already written necessary information into container labels. Here we
   685  // just need to retrieve them from container labels and restore the specs.
   686  // TODO(random-liu): Add a node e2e test to test this behaviour.
   687  // TODO(random-liu): Change the lifecycle handler to just accept information needed, so that we can
   688  // just pass the needed function not create the fake object.
   689  func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(ctx context.Context, containerID kubecontainer.ContainerID) (*v1.Pod, *v1.Container, error) {
   690  	var pod *v1.Pod
   691  	var container *v1.Container
   692  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
   693  	if err != nil {
   694  		return nil, nil, err
   695  	}
   696  	s := resp.GetStatus()
   697  	if s == nil {
   698  		return nil, nil, remote.ErrContainerStatusNil
   699  	}
   700  
   701  	l := getContainerInfoFromLabels(s.Labels)
   702  	a := getContainerInfoFromAnnotations(s.Annotations)
   703  	// Notice that the following are not full specs. The container-killing code should not use
   704  	// un-restored fields.
   705  	pod = &v1.Pod{
   706  		ObjectMeta: metav1.ObjectMeta{
   707  			UID:                        l.PodUID,
   708  			Name:                       l.PodName,
   709  			Namespace:                  l.PodNamespace,
   710  			DeletionGracePeriodSeconds: a.PodDeletionGracePeriod,
   711  		},
   712  		Spec: v1.PodSpec{
   713  			TerminationGracePeriodSeconds: a.PodTerminationGracePeriod,
   714  		},
   715  	}
   716  	container = &v1.Container{
   717  		Name:                   l.ContainerName,
   718  		Ports:                  a.ContainerPorts,
   719  		TerminationMessagePath: a.TerminationMessagePath,
   720  	}
   721  	if a.PreStopHandler != nil {
   722  		container.Lifecycle = &v1.Lifecycle{
   723  			PreStop: a.PreStopHandler,
   724  		}
   725  	}
   726  	return pod, container, nil
   727  }
   728  
   729  // killContainer kills a container through the following steps:
   730  // * Run the pre-stop lifecycle hooks (if applicable).
   731  // * Stop the container.
   732  func (m *kubeGenericRuntimeManager) killContainer(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64, ordering *terminationOrdering) error {
   733  	var containerSpec *v1.Container
   734  	if pod != nil {
   735  		if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
   736  			return fmt.Errorf("failed to get containerSpec %q (id=%q) in pod %q when killing container for reason %q",
   737  				containerName, containerID.String(), format.Pod(pod), message)
   738  		}
   739  	} else {
   740  		// Restore necessary information if one of the specs is nil.
   741  		restoredPod, restoredContainer, err := m.restoreSpecsFromContainerLabels(ctx, containerID)
   742  		if err != nil {
   743  			return err
   744  		}
   745  		pod, containerSpec = restoredPod, restoredContainer
   746  	}
   747  
   748  	// From this point, pod and container must be non-nil.
   749  	gracePeriod := setTerminationGracePeriod(pod, containerSpec, containerName, containerID, reason)
   750  
   751  	if len(message) == 0 {
   752  		message = fmt.Sprintf("Stopping container %s", containerSpec.Name)
   753  	}
   754  	m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
   755  
   756  	if gracePeriodOverride != nil {
   757  		gracePeriod = *gracePeriodOverride
   758  		klog.V(3).InfoS("Killing container with a grace period override", "pod", klog.KObj(pod), "podUID", pod.UID,
   759  			"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   760  	}
   761  
   762  	// Run the pre-stop lifecycle hooks if applicable and if there is enough time to run it
   763  	if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
   764  		gracePeriod = gracePeriod - m.executePreStopHook(ctx, pod, containerID, containerSpec, gracePeriod)
   765  	}
   766  
   767  	// if we care about termination ordering, then wait for this container's turn to exit if there is
   768  	// time remaining
   769  	if ordering != nil && gracePeriod > 0 {
   770  		// grace period is only in seconds, so the time we've waited gets truncated downward
   771  		gracePeriod -= int64(ordering.waitForTurn(containerName, gracePeriod))
   772  	}
   773  
   774  	// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
   775  	if gracePeriod < minimumGracePeriodInSeconds {
   776  		gracePeriod = minimumGracePeriodInSeconds
   777  	}
   778  
   779  	klog.V(2).InfoS("Killing container with a grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
   780  		"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   781  
   782  	err := m.runtimeService.StopContainer(ctx, containerID.ID, gracePeriod)
   783  	if err != nil && !crierror.IsNotFound(err) {
   784  		klog.ErrorS(err, "Container termination failed with gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID,
   785  			"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   786  		return err
   787  	}
   788  	klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID,
   789  		"containerName", containerName, "containerID", containerID.String())
   790  
   791  	if ordering != nil {
   792  		ordering.containerTerminated(containerName)
   793  	}
   794  
   795  	return nil
   796  }
   797  
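// A minimal arithmetic sketch, not part of the upstream file (all inputs are
// hypothetical), of how killContainer arrives at the effective grace period:
// start from the pod's grace period (or an override), subtract the seconds
// spent in the preStop hook and in any termination-ordering wait, and never go
// below minimumGracePeriodInSeconds so the container still gets a short window
// before being killed.
func exampleEffectiveGracePeriod(gracePeriod, preStopSeconds, orderingWaitSeconds int64) int64 {
	gracePeriod -= preStopSeconds
	gracePeriod -= orderingWaitSeconds
	if gracePeriod < minimumGracePeriodInSeconds {
		gracePeriod = minimumGracePeriodInSeconds
	}
	return gracePeriod
}
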
   798  // killContainersWithSyncResult kills all pod's containers with sync results.
   799  func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) {
   800  	containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
   801  	wg := sync.WaitGroup{}
   802  
   803  	wg.Add(len(runningPod.Containers))
   804  	var termOrdering *terminationOrdering
   805  	// we only care about container termination ordering if the sidecars feature is enabled
   806  	if utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) {
   807  		var runningContainerNames []string
   808  		for _, container := range runningPod.Containers {
   809  			runningContainerNames = append(runningContainerNames, container.Name)
   810  		}
   811  		termOrdering = newTerminationOrdering(pod, runningContainerNames)
   812  	}
   813  	for _, container := range runningPod.Containers {
   814  		go func(container *kubecontainer.Container) {
   815  			defer utilruntime.HandleCrash()
   816  			defer wg.Done()
   817  
   818  			killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
   819  			if err := m.killContainer(ctx, pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride, termOrdering); err != nil {
   820  				killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
   821  				// Use runningPod for logging as the pod passed in could be *nil*.
   822  				klog.ErrorS(err, "Kill container failed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID,
   823  					"containerName", container.Name, "containerID", container.ID)
   824  			}
   825  			containerResults <- killContainerResult
   826  		}(container)
   827  	}
   828  	wg.Wait()
   829  	close(containerResults)
   830  
   831  	for containerResult := range containerResults {
   832  		syncResults = append(syncResults, containerResult)
   833  	}
   834  	return
   835  }
   836  
   837  // pruneInitContainersBeforeStart ensures that before we begin creating init
   838  // containers, we have reduced the number of outstanding init containers still
   839  // present. This reduces load on the container garbage collector by only
   840  // preserving the most recent terminated init container.
   841  func (m *kubeGenericRuntimeManager) pruneInitContainersBeforeStart(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
   842  	// only the last execution of each init container should be preserved, and only preserve it if it is in the
   843  	// list of init containers to keep.
   844  	initContainerNames := sets.NewString()
   845  	for _, container := range pod.Spec.InitContainers {
   846  		initContainerNames.Insert(container.Name)
   847  	}
   848  	for name := range initContainerNames {
   849  		count := 0
   850  		for _, status := range podStatus.ContainerStatuses {
   851  			if status.Name != name ||
   852  				(status.State != kubecontainer.ContainerStateExited &&
   853  					status.State != kubecontainer.ContainerStateUnknown) {
   854  				continue
   855  			}
   856  			// Remove init containers in unknown state. They should
   857  			// have been stopped before pruneInitContainersBeforeStart
   858  			// is called.
   859  			count++
   860  			// keep the first init container for this name
   861  			if count == 1 {
   862  				continue
   863  			}
   864  			// prune all other init containers that match this container name
   865  			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
   866  			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
   867  				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
   868  				continue
   869  			}
   870  		}
   871  	}
   872  }
   873  
   874  // Remove all init containers. Note that this function does not check the state
   875  // of the container because it assumes all init containers have been stopped
   876  // before the call happens.
   877  func (m *kubeGenericRuntimeManager) purgeInitContainers(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
   878  	initContainerNames := sets.NewString()
   879  	for _, container := range pod.Spec.InitContainers {
   880  		initContainerNames.Insert(container.Name)
   881  	}
   882  	for name := range initContainerNames {
   883  		count := 0
   884  		for _, status := range podStatus.ContainerStatuses {
   885  			if status.Name != name {
   886  				continue
   887  			}
   888  			count++
   889  			// Purge all init containers that match this container name
   890  			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
   891  			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
   892  				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
   893  				continue
   894  			}
   895  		}
   896  	}
   897  }
   898  
   899  // findNextInitContainerToRun returns the status of the last failed container, the
   900  // next init container to start, or done if there are no further init containers.
   901  // Status is only returned if an init container has failed, in which case next will
   902  // point to the current container.
   903  func findNextInitContainerToRun(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (status *kubecontainer.Status, next *v1.Container, done bool) {
   904  	if len(pod.Spec.InitContainers) == 0 {
   905  		return nil, nil, true
   906  	}
   907  
   908  	// If any of the main containers have status and are Running, then all init containers must
   909  	// have been executed at some point in the past.  However, they could have been removed
   910  	// from the container runtime now, and if we proceed, it would appear as if they
   911  	// never ran and will re-execute improperly.
   912  	for i := range pod.Spec.Containers {
   913  		container := &pod.Spec.Containers[i]
   914  		status := podStatus.FindContainerStatusByName(container.Name)
   915  		if status != nil && status.State == kubecontainer.ContainerStateRunning {
   916  			return nil, nil, true
   917  		}
   918  	}
   919  
   920  	// If there are failed containers, return the status of the last failed one.
   921  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
   922  		container := &pod.Spec.InitContainers[i]
   923  		status := podStatus.FindContainerStatusByName(container.Name)
   924  		if status != nil && isInitContainerFailed(status) {
   925  			return status, container, false
   926  		}
   927  	}
   928  
   929  	// There are no failed containers now.
   930  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
   931  		container := &pod.Spec.InitContainers[i]
   932  		status := podStatus.FindContainerStatusByName(container.Name)
   933  		if status == nil {
   934  			continue
   935  		}
   936  
   937  		// container is still running, return not done.
   938  		if status.State == kubecontainer.ContainerStateRunning {
   939  			return nil, nil, false
   940  		}
   941  
   942  		if status.State == kubecontainer.ContainerStateExited {
   943  			// all init containers successful
   944  			if i == (len(pod.Spec.InitContainers) - 1) {
   945  				return nil, nil, true
   946  			}
   947  
   948  			// all containers up to i successful, go to i+1
   949  			return nil, &pod.Spec.InitContainers[i+1], false
   950  		}
   951  	}
   952  
   953  	return nil, &pod.Spec.InitContainers[0], false
   954  }
   955  
   956  // hasAnyRegularContainerCreated returns true if any regular container has been
   957  // created, which indicates all init containers have been initialized.
   958  func hasAnyRegularContainerCreated(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool {
   959  	for _, container := range pod.Spec.Containers {
   960  		status := podStatus.FindContainerStatusByName(container.Name)
   961  		if status == nil {
   962  			continue
   963  		}
   964  		switch status.State {
   965  		case kubecontainer.ContainerStateCreated,
   966  			kubecontainer.ContainerStateRunning,
   967  			kubecontainer.ContainerStateExited:
   968  			return true
   969  		default:
   970  			// Ignore other states
   971  		}
   972  	}
   973  	return false
   974  }
   975  
   976  // computeInitContainerActions sets the actions on the given changes that need
   977  // to be taken for the init containers. This includes actions to initialize the
   978  // init containers and actions to keep restartable init containers running.
   979  // computeInitContainerActions returns true if pod has been initialized.
   980  //
   981  // The actions include:
   982  // - Start the first init container that has not been started.
   983  // - Restart all restartable init containers that have started but are not running.
   984  // - Kill the restartable init containers that are not alive or started.
   985  //
   986  // Note that this is a function for the SidecarContainers feature.
   987  // Please sync with the findNextInitContainerToRun function if any changes are
   988  // made, as either this or that function will be called.
   989  func (m *kubeGenericRuntimeManager) computeInitContainerActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus, changes *podActions) bool {
   990  	if len(pod.Spec.InitContainers) == 0 {
   991  		return true
   992  	}
   993  
   994  	// If any of the main containers have status and are Running, then all init containers must
   995  	// have been executed at some point in the past.  However, they could have been removed
   996  	// from the container runtime now, and if we proceed, it would appear as if they
   997  	// never ran and will re-execute improperly except for the restartable init containers.
   998  	podHasInitialized := false
   999  	for _, container := range pod.Spec.Containers {
  1000  		status := podStatus.FindContainerStatusByName(container.Name)
  1001  		if status == nil {
  1002  			continue
  1003  		}
  1004  		switch status.State {
  1005  		case kubecontainer.ContainerStateCreated,
  1006  			kubecontainer.ContainerStateRunning:
  1007  			podHasInitialized = true
  1008  		case kubecontainer.ContainerStateExited:
  1009  			// This is a workaround for the issue that the kubelet cannot
  1010  			// differentiate the container statuses of the previous podSandbox
  1011  			// from the current one.
  1012  			// If the node is rebooted, all containers will be in the exited
  1013  			// state and the kubelet will try to recreate a new podSandbox.
  1014  			// In this case, the kubelet should not mistakenly think that
  1015  			// the newly created podSandbox has been initialized.
  1016  		default:
  1017  			// Ignore other states
  1018  		}
  1019  		if podHasInitialized {
  1020  			break
  1021  		}
  1022  	}
  1023  
  1024  	// isPreviouslyInitialized indicates whether the init container currently
  1025  	// being inspected was previously initialized.
  1026  	isPreviouslyInitialized := podHasInitialized
  1027  	restartOnFailure := shouldRestartOnFailure(pod)
  1028  
  1029  	// Note that we iterate through the init containers in reverse order to find
  1030  	// the next init container to run, as the completed init containers may get
  1031  	// removed from container runtime for various reasons. Therefore the kubelet
  1032  	// should rely on the minimal number of init containers - the last one.
  1033  	//
  1034  	// Once we find the next init container to run, iterate through the rest to
  1035  	// find the restartable init containers to restart.
  1036  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
  1037  		container := &pod.Spec.InitContainers[i]
  1038  		status := podStatus.FindContainerStatusByName(container.Name)
  1039  		klog.V(4).InfoS("Computing init container action", "pod", klog.KObj(pod), "container", container.Name, "status", status)
  1040  		if status == nil {
  1041  			// If the container was previously initialized but its status is not
  1042  			// found, its last status has been removed for some reason.
  1043  			// Restart it if it is a restartable init container.
  1044  			if isPreviouslyInitialized && types.IsRestartableInitContainer(container) {
  1045  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1046  			}
  1047  			continue
  1048  		}
  1049  
  1050  		if isPreviouslyInitialized && !types.IsRestartableInitContainer(container) {
  1051  			// after initialization, only restartable init containers need to be kept
  1052  			// running
  1053  			continue
  1054  		}
  1055  
  1056  		switch status.State {
  1057  		case kubecontainer.ContainerStateCreated:
  1058  			// nothing to do but wait for it to start
  1059  
  1060  		case kubecontainer.ContainerStateRunning:
  1061  			if !types.IsRestartableInitContainer(container) {
  1062  				break
  1063  			}
  1064  
  1065  			if types.IsRestartableInitContainer(container) {
  1066  				if container.StartupProbe != nil {
  1067  					startup, found := m.startupManager.Get(status.ID)
  1068  					if !found {
  1069  						// If the startup probe has not been run, wait for it.
  1070  						break
  1071  					}
  1072  					if startup != proberesults.Success {
  1073  						if startup == proberesults.Failure {
  1074  							// If the restartable init container failed the startup probe,
  1075  							// restart it.
  1076  							changes.ContainersToKill[status.ID] = containerToKillInfo{
  1077  								name:      container.Name,
  1078  								container: container,
  1079  								message:   fmt.Sprintf("Init container %s failed startup probe", container.Name),
  1080  								reason:    reasonStartupProbe,
  1081  							}
  1082  							changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1083  						}
  1084  						break
  1085  					}
  1086  				}
  1087  
  1088  				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
  1089  				if i == (len(pod.Spec.InitContainers) - 1) {
  1090  					podHasInitialized = true
  1091  				} else if !isPreviouslyInitialized {
  1092  					// this init container is initialized for the first time, start the next one
  1093  					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
  1094  				}
  1095  
  1096  				// A restartable init container does not have to take into account its
  1097  				// liveness probe when it determines to start the next init container.
  1098  				if container.LivenessProbe != nil {
  1099  					liveness, found := m.livenessManager.Get(status.ID)
  1100  					if !found {
  1101  						// If the liveness probe has not been run, wait for it.
  1102  						break
  1103  					}
  1104  					if liveness == proberesults.Failure {
  1105  						// If the restartable init container failed the liveness probe,
  1106  						// restart it.
  1107  						changes.ContainersToKill[status.ID] = containerToKillInfo{
  1108  							name:      container.Name,
  1109  							container: container,
  1110  							message:   fmt.Sprintf("Init container %s failed liveness probe", container.Name),
  1111  							reason:    reasonLivenessProbe,
  1112  						}
  1113  						changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1114  					}
  1115  				}
  1116  			} else { // init container
  1117  				// nothing to do but wait for it to finish
  1118  				break
  1119  			}
  1120  
  1121  		// If the init container failed and the restart policy is Never, the pod is terminal.
  1122  		// Otherwise, restart the init container.
  1123  		case kubecontainer.ContainerStateExited:
  1124  			if types.IsRestartableInitContainer(container) {
  1125  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1126  			} else { // init container
  1127  				if isInitContainerFailed(status) {
  1128  					if !restartOnFailure {
  1129  						changes.KillPod = true
  1130  						changes.InitContainersToStart = nil
  1131  						return false
  1132  					}
  1133  					changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1134  					break
  1135  				}
  1136  
  1137  				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
  1138  				if i == (len(pod.Spec.InitContainers) - 1) {
  1139  					podHasInitialized = true
  1140  				} else {
  1141  					// this init container is initialized for the first time, start the next one
  1142  					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
  1143  				}
  1144  			}
  1145  
  1146  		default: // kubecontainer.ContainerStateUnknown or other unknown states
  1147  			if types.IsRestartableInitContainer(container) {
  1148  				// If the restartable init container is in unknown state, restart it.
  1149  				changes.ContainersToKill[status.ID] = containerToKillInfo{
  1150  					name:      container.Name,
  1151  					container: container,
  1152  					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  1153  						status.State),
  1154  					reason: reasonUnknown,
  1155  				}
  1156  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1157  			} else { // init container
  1158  				if !isInitContainerFailed(status) {
  1159  					klog.V(4).InfoS("This should not happen, init container is in unknown state but not failed", "pod", klog.KObj(pod), "containerStatus", status)
  1160  				}
  1161  
  1162  				if !restartOnFailure {
  1163  					changes.KillPod = true
  1164  					changes.InitContainersToStart = nil
  1165  					return false
  1166  				}
  1167  
  1168  				// If the init container is in unknown state, restart it.
  1169  				changes.ContainersToKill[status.ID] = containerToKillInfo{
  1170  					name:      container.Name,
  1171  					container: container,
  1172  					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  1173  						status.State),
  1174  					reason: reasonUnknown,
  1175  				}
  1176  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1177  			}
  1178  		}
  1179  
  1180  		if !isPreviouslyInitialized {
  1181  			// this init container has already been started, so the init containers before it
  1182  			// must have been initialized previously
  1182  			isPreviouslyInitialized = true
  1183  		}
  1184  	}
  1185  
  1186  	// this means no init containers have been started,
  1187  	// start the first one
  1188  	if !isPreviouslyInitialized {
  1189  		changes.InitContainersToStart = append(changes.InitContainersToStart, 0)
  1190  	}
  1191  
  1192  	// reverse the InitContainersToStart, as the above loop iterated through the
  1193  	// init containers backwards, but we want to start them as per the order in
  1194  	// the pod spec.
  1195  	l := len(changes.InitContainersToStart)
  1196  	for i := 0; i < l/2; i++ {
  1197  		changes.InitContainersToStart[i], changes.InitContainersToStart[l-1-i] =
  1198  			changes.InitContainersToStart[l-1-i], changes.InitContainersToStart[i]
  1199  	}
  1200  
  1201  	return podHasInitialized
  1202  }
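
        // Hypothetical sketch (not part of the original file): the reversal above, expressed as a
        // standalone helper. It swaps symmetric elements until the midpoint is reached, turning the
        // backwards iteration order of the loop into pod-spec order.
        func reverseInts(s []int) {
        	for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
        		s[i], s[j] = s[j], s[i]
        	}
        }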
  1203  
  1204  // GetContainerLogs returns logs of a specific container.
  1205  func (m *kubeGenericRuntimeManager) GetContainerLogs(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) (err error) {
  1206  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
  1207  	if err != nil {
  1208  		klog.V(4).InfoS("Failed to get container status", "containerID", containerID.String(), "err", err)
  1209  		return fmt.Errorf("unable to retrieve container logs for %v", containerID.String())
  1210  	}
  1211  	status := resp.GetStatus()
  1212  	if status == nil {
  1213  		return remote.ErrContainerStatusNil
  1214  	}
  1215  	return m.ReadLogs(ctx, status.GetLogPath(), containerID.ID, logOptions, stdout, stderr)
  1216  }
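
        // Hypothetical caller-side sketch (not part of the original file): streaming the last 100
        // lines of a container's log with GetContainerLogs. The pod, container ID, and writer are
        // assumed to be supplied by the caller; stdout and stderr share one writer here.
        func exampleStreamLogs(ctx context.Context, m *kubeGenericRuntimeManager, pod *v1.Pod, id kubecontainer.ContainerID, out io.Writer) error {
        	tailLines := int64(100)
        	opts := &v1.PodLogOptions{TailLines: &tailLines}
        	return m.GetContainerLogs(ctx, pod, id, opts, out, out)
        }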
  1217  
  1218  // GetExec gets the endpoint the runtime will serve the exec request from.
  1219  func (m *kubeGenericRuntimeManager) GetExec(ctx context.Context, id kubecontainer.ContainerID, cmd []string, stdin, stdout, stderr, tty bool) (*url.URL, error) {
  1220  	req := &runtimeapi.ExecRequest{
  1221  		ContainerId: id.ID,
  1222  		Cmd:         cmd,
  1223  		Tty:         tty,
  1224  		Stdin:       stdin,
  1225  		Stdout:      stdout,
  1226  		Stderr:      stderr,
  1227  	}
  1228  	resp, err := m.runtimeService.Exec(ctx, req)
  1229  	if err != nil {
  1230  		return nil, err
  1231  	}
  1232  
  1233  	return url.Parse(resp.Url)
  1234  }
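
        // Hypothetical caller-side sketch (not part of the original file): requesting an exec
        // streaming endpoint for an interactive shell. The /bin/sh command is illustrative; the
        // caller (e.g. the kubelet streaming proxy) would redirect the client to the returned URL.
        func exampleGetExecURL(ctx context.Context, m *kubeGenericRuntimeManager, id kubecontainer.ContainerID) (*url.URL, error) {
        	// Request stdin, stdout, and a TTY; stderr is omitted because a TTY merges the streams.
        	return m.GetExec(ctx, id, []string{"/bin/sh"}, true, true, false, true)
        }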
  1235  
  1236  // GetAttach gets the endpoint the runtime will serve the attach request from.
  1237  func (m *kubeGenericRuntimeManager) GetAttach(ctx context.Context, id kubecontainer.ContainerID, stdin, stdout, stderr, tty bool) (*url.URL, error) {
  1238  	req := &runtimeapi.AttachRequest{
  1239  		ContainerId: id.ID,
  1240  		Stdin:       stdin,
  1241  		Stdout:      stdout,
  1242  		Stderr:      stderr,
  1243  		Tty:         tty,
  1244  	}
  1245  	resp, err := m.runtimeService.Attach(ctx, req)
  1246  	if err != nil {
  1247  		return nil, err
  1248  	}
  1249  	return url.Parse(resp.Url)
  1250  }
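
        // Hypothetical caller-side sketch (not part of the original file): requesting an attach
        // streaming endpoint for a container started with a TTY. Whether stdin may be attached
        // depends on the pod spec; it is requested here purely for illustration.
        func exampleGetAttachURL(ctx context.Context, m *kubeGenericRuntimeManager, id kubecontainer.ContainerID) (*url.URL, error) {
        	return m.GetAttach(ctx, id, true /*stdin*/, true /*stdout*/, false /*stderr*/, true /*tty*/)
        }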
  1251  
  1252  // RunInContainer synchronously executes the command in the container, and returns the output.
  1253  func (m *kubeGenericRuntimeManager) RunInContainer(ctx context.Context, id kubecontainer.ContainerID, cmd []string, timeout time.Duration) ([]byte, error) {
  1254  	stdout, stderr, err := m.runtimeService.ExecSync(ctx, id.ID, cmd, timeout)
  1255  	// NOTE(tallclair): This does not correctly interleave stdout & stderr, but should be sufficient
  1256  	// for logging purposes. A combined output option will need to be added to the ExecSyncRequest
  1257  	// if more precise output ordering is ever required.
  1258  	return append(stdout, stderr...), err
  1259  }
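
        // Hypothetical caller-side sketch (not part of the original file): running a short command
        // synchronously and logging its combined stdout+stderr. The command and the 10-second
        // timeout are example values only.
        func exampleRunInContainer(ctx context.Context, m *kubeGenericRuntimeManager, id kubecontainer.ContainerID) {
        	out, err := m.RunInContainer(ctx, id, []string{"cat", "/etc/resolv.conf"}, 10*time.Second)
        	if err != nil {
        		klog.ErrorS(err, "RunInContainer failed", "containerID", id.String())
        		return
        	}
        	klog.V(4).InfoS("RunInContainer output", "containerID", id.String(), "output", string(out))
        }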
  1260  
  1261  // removeContainer removes the container and the container logs.
  1262  // Notice that we remove the container logs first, so that the container will not be removed if
  1263  // removing the container logs fails; the kubelet will then retry later. This guarantees that the
  1264  // container logs are removed together with the container.
  1265  // Notice that we assume the container is only removed in a non-running state, in which it will
  1266  // not write container logs anymore.
  1267  func (m *kubeGenericRuntimeManager) removeContainer(ctx context.Context, containerID string) error {
  1268  	klog.V(4).InfoS("Removing container", "containerID", containerID)
  1269  	// Call internal container post-stop lifecycle hook.
  1270  	if err := m.internalLifecycle.PostStopContainer(containerID); err != nil {
  1271  		return err
  1272  	}
  1273  
  1274  	// Remove the container log.
  1275  	// TODO: Separate log and container lifecycle management.
  1276  	if err := m.removeContainerLog(ctx, containerID); err != nil {
  1277  		return err
  1278  	}
  1279  	// Remove the container.
  1280  	return m.runtimeService.RemoveContainer(ctx, containerID)
  1281  }
  1282  
  1283  // removeContainerLog removes the container log.
  1284  func (m *kubeGenericRuntimeManager) removeContainerLog(ctx context.Context, containerID string) error {
  1285  	// Use log manager to remove rotated logs.
  1286  	err := m.logManager.Clean(ctx, containerID)
  1287  	if err != nil {
  1288  		return err
  1289  	}
  1290  
  1291  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID, false)
  1292  	if err != nil {
  1293  		return fmt.Errorf("failed to get container status %q: %v", containerID, err)
  1294  	}
  1295  	status := resp.GetStatus()
  1296  	if status == nil {
  1297  		return remote.ErrContainerStatusNil
  1298  	}
  1299  	// Remove the legacy container log symlink.
  1300  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
  1301  	labeledInfo := getContainerInfoFromLabels(status.Labels)
  1302  	legacySymlink := legacyLogSymlink(containerID, labeledInfo.ContainerName, labeledInfo.PodName,
  1303  		labeledInfo.PodNamespace)
  1304  	if err := m.osInterface.Remove(legacySymlink); err != nil && !os.IsNotExist(err) {
  1305  		return fmt.Errorf("failed to remove container %q log legacy symbolic link %q: %v",
  1306  			containerID, legacySymlink, err)
  1307  	}
  1308  	return nil
  1309  }
  1310  
  1311  // DeleteContainer removes a container.
  1312  func (m *kubeGenericRuntimeManager) DeleteContainer(ctx context.Context, containerID kubecontainer.ContainerID) error {
  1313  	return m.removeContainer(ctx, containerID.ID)
  1314  }
  1315  
  1316  // setTerminationGracePeriod determines the grace period to use when killing a container
  1317  func setTerminationGracePeriod(pod *v1.Pod, containerSpec *v1.Container, containerName string, containerID kubecontainer.ContainerID, reason containerKillReason) int64 {
  1318  	gracePeriod := int64(minimumGracePeriodInSeconds)
  1319  	switch {
  1320  	case pod.DeletionGracePeriodSeconds != nil:
  1321  		return *pod.DeletionGracePeriodSeconds
  1322  	case pod.Spec.TerminationGracePeriodSeconds != nil:
  1323  		switch reason {
  1324  		case reasonStartupProbe:
  1325  			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.StartupProbe, containerName, containerID, "StartupProbe") {
  1326  				return *containerSpec.StartupProbe.TerminationGracePeriodSeconds
  1327  			}
  1328  		case reasonLivenessProbe:
  1329  			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.LivenessProbe, containerName, containerID, "LivenessProbe") {
  1330  				return *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
  1331  			}
  1332  		}
  1333  		return *pod.Spec.TerminationGracePeriodSeconds
  1334  	}
  1335  	return gracePeriod
  1336  }
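
        // Hypothetical sketch (not part of the original file) of the precedence above: with no
        // deletion grace period set, the pod-level 30s applies unless the kill reason matches a
        // probe that sets its own value, in which case the probe-level 10s is used. All names and
        // values below are invented for illustration.
        func exampleGracePeriod() int64 {
        	podGrace, probeGrace := int64(30), int64(10)
        	pod := &v1.Pod{Spec: v1.PodSpec{TerminationGracePeriodSeconds: &podGrace}}
        	c := &v1.Container{
        		Name:          "app",
        		LivenessProbe: &v1.Probe{TerminationGracePeriodSeconds: &probeGrace},
        	}
        	// Killed because the liveness probe failed: returns 10, the probe-level override.
        	return setTerminationGracePeriod(pod, c, c.Name, kubecontainer.ContainerID{}, reasonLivenessProbe)
        }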
  1337  
  1338  func isProbeTerminationGracePeriodSecondsSet(pod *v1.Pod, containerSpec *v1.Container, probe *v1.Probe, containerName string, containerID kubecontainer.ContainerID, probeType string) bool {
  1339  	if probe != nil && probe.TerminationGracePeriodSeconds != nil {
  1340  		if *probe.TerminationGracePeriodSeconds > *pod.Spec.TerminationGracePeriodSeconds {
  1341  			klog.V(4).InfoS("Using probe-level grace period that is greater than the pod-level grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerName, "containerID", containerID.String(), "probeType", probeType, "probeGracePeriod", *probe.TerminationGracePeriodSeconds, "podGracePeriod", *pod.Spec.TerminationGracePeriodSeconds)
  1342  		}
  1343  		return true
  1344  	}
  1345  	return false
  1346  }