k8s.io/kubernetes@v1.29.3/pkg/kubelet/kuberuntime/kuberuntime_container.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kuberuntime
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math/rand"
    25  	"net/url"
    26  	"os"
    27  	"path/filepath"
    28  	"regexp"
    29  	goruntime "runtime"
    30  	"sort"
    31  	"strconv"
    32  	"strings"
    33  	"sync"
    34  	"time"
    35  
    36  	crierror "k8s.io/cri-api/pkg/errors"
    37  
    38  	"github.com/opencontainers/selinux/go-selinux"
    39  	grpcstatus "google.golang.org/grpc/status"
    40  
    41  	"github.com/armon/circbuf"
    42  	"k8s.io/klog/v2"
    43  
    44  	v1 "k8s.io/api/core/v1"
    45  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    46  	kubetypes "k8s.io/apimachinery/pkg/types"
    47  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    48  	"k8s.io/apimachinery/pkg/util/sets"
    49  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    50  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    51  	kubelettypes "k8s.io/kubelet/pkg/types"
    52  	"k8s.io/kubernetes/pkg/features"
    53  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    54  	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
    55  	"k8s.io/kubernetes/pkg/kubelet/events"
    56  	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
    57  	"k8s.io/kubernetes/pkg/kubelet/types"
    58  	"k8s.io/kubernetes/pkg/kubelet/util/format"
    59  	"k8s.io/kubernetes/pkg/util/tail"
    60  	volumeutil "k8s.io/kubernetes/pkg/volume/util"
    61  )
    62  
    63  var (
    64  	// ErrCreateContainerConfig - failed to create container config
    65  	ErrCreateContainerConfig = errors.New("CreateContainerConfigError")
    66  	// ErrPreCreateHook - failed to execute PreCreateHook
    67  	ErrPreCreateHook = errors.New("PreCreateHookError")
    68  	// ErrCreateContainer - failed to create container
    69  	ErrCreateContainer = errors.New("CreateContainerError")
    70  	// ErrPreStartHook - failed to execute PreStartHook
    71  	ErrPreStartHook = errors.New("PreStartHookError")
    72  	// ErrPostStartHook - failed to execute PostStartHook
    73  	ErrPostStartHook = errors.New("PostStartHookError")
    74  )
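// Editor's note: a minimal sketch (not part of the original file) of how a caller
// might map these sentinel errors back to a reason string with errors.Is; the
// switch below is illustrative, not the kubelet's actual bookkeeping.
//
//	var reason string
//	switch {
//	case errors.Is(err, ErrCreateContainerConfig):
//		reason = ErrCreateContainerConfig.Error() // "CreateContainerConfigError"
//	case errors.Is(err, ErrPostStartHook):
//		reason = ErrPostStartHook.Error() // "PostStartHookError"
//	default:
//		reason = err.Error()
//	}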
    75  
    76  // recordContainerEvent should be used by the runtime manager for all container related events.
    77  // it has sanity checks to ensure that we do not write events that can abuse our masters.
    78  // in particular, it ensures that a containerID never appears in an event message, as that
    79  // is prone to causing many distinct events that do not deduplicate well.
    80  // it replaces any reference to a containerID with the containerName, which is stable and is what users know.
    81  func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
    82  	ref, err := kubecontainer.GenerateContainerRef(pod, container)
    83  	if err != nil {
    84  		klog.ErrorS(err, "Can't make a container ref", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
    85  		return
    86  	}
    87  	eventMessage := message
    88  	if len(args) > 0 {
    89  		eventMessage = fmt.Sprintf(message, args...)
    90  	}
    91  	// this is a hack, but often the error from the runtime includes the containerID
    92  	// which kills our ability to deduplicate events.  this protection makes a huge
    93  	// difference in the number of unique events
    94  	if containerID != "" {
    95  		eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
    96  	}
    97  	m.recorder.Event(ref, eventType, reason, eventMessage)
    98  }
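// Editor's note: an illustrative example (not from the original file) of the
// containerID scrub above. Given a hypothetical containerID "0123abcd..." and
// container name "nginx", a runtime message such as
//
//	Error: failed to start container "0123abcd...": OCI runtime error
//
// is recorded as
//
//	Error: failed to start container "nginx": OCI runtime error
//
// so repeated failures of the same container collapse into one deduplicated event.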
    99  
   100  // startSpec wraps the spec required to start a container, either a regular/init container
   101  // or an ephemeral container. Ephemeral containers contain all the fields of regular/init
   102  // containers, plus some additional fields. In both cases startSpec.container will be set.
   103  type startSpec struct {
   104  	container          *v1.Container
   105  	ephemeralContainer *v1.EphemeralContainer
   106  }
   107  
   108  func containerStartSpec(c *v1.Container) *startSpec {
   109  	return &startSpec{container: c}
   110  }
   111  
   112  func ephemeralContainerStartSpec(ec *v1.EphemeralContainer) *startSpec {
   113  	return &startSpec{
   114  		container:          (*v1.Container)(&ec.EphemeralContainerCommon),
   115  		ephemeralContainer: ec,
   116  	}
   117  }
   118  
   119  // getTargetID returns the kubecontainer.ContainerID for ephemeral container namespace
   120  // targeting. The target is stored as EphemeralContainer.TargetContainerName, which must be
   121  // resolved to a ContainerID using podStatus. The target container must already exist, which
   122  // usually isn't a problem since ephemeral containers aren't allowed at pod creation time.
   123  func (s *startSpec) getTargetID(podStatus *kubecontainer.PodStatus) (*kubecontainer.ContainerID, error) {
   124  	if s.ephemeralContainer == nil || s.ephemeralContainer.TargetContainerName == "" {
   125  		return nil, nil
   126  	}
   127  
   128  	targetStatus := podStatus.FindContainerStatusByName(s.ephemeralContainer.TargetContainerName)
   129  	if targetStatus == nil {
   130  		return nil, fmt.Errorf("unable to find target container %v", s.ephemeralContainer.TargetContainerName)
   131  	}
   132  
   133  	return &targetStatus.ID, nil
   134  }
   135  
   136  func calcRestartCountByLogDir(path string) (int, error) {
   137  	// if the path doesn't exist then it's not an error
   138  	if _, err := os.Stat(path); err != nil {
   139  		return 0, nil
   140  	}
   141  	files, err := os.ReadDir(path)
   142  	if err != nil {
   143  		return 0, err
   144  	}
   145  	if len(files) == 0 {
   146  		return 0, nil
   147  	}
   148  	restartCount := 0
   149  	restartCountLogFileRegex := regexp.MustCompile(`^(\d+)\.log(\..*)?`)
   150  	for _, file := range files {
   151  		if file.IsDir() {
   152  			continue
   153  		}
   154  		matches := restartCountLogFileRegex.FindStringSubmatch(file.Name())
   155  		if len(matches) == 0 {
   156  			continue
   157  		}
   158  		count, err := strconv.Atoi(matches[1])
   159  		if err != nil {
   160  			// the kubelet is unlikely to have created this file;
   161  			// it is more likely a custom file with a number-like name
   162  			continue
   163  		}
   164  		count++
   165  		if count > restartCount {
   166  			restartCount = count
   167  		}
   168  	}
   169  	return restartCount, nil
   170  }
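// Editor's note: a standalone sketch (not part of the original file) of the
// restart-count derivation above, applied to an in-memory list of file names
// instead of a directory listing. The file names are hypothetical.
//
//	names := []string{"0.log", "1.log.20240101-000000", "2.log", "notes.txt"}
//	re := regexp.MustCompile(`^(\d+)\.log(\..*)?`)
//	restartCount := 0
//	for _, name := range names {
//		m := re.FindStringSubmatch(name)
//		if len(m) == 0 {
//			continue
//		}
//		if n, err := strconv.Atoi(m[1]); err == nil && n+1 > restartCount {
//			restartCount = n + 1
//		}
//	}
//	// restartCount == 3: the highest attempt on disk is 2.log, so the next attempt is 3.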
   171  
   172  // startContainer starts a container and, on error, returns a message indicating why it failed.
   173  // It starts the container through the following steps:
   174  // * pull the image
   175  // * create the container
   176  // * start the container
   177  // * run the post start lifecycle hooks (if applicable)
   178  func (m *kubeGenericRuntimeManager) startContainer(ctx context.Context, podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
   179  	container := spec.container
   180  
   181  	// Step 1: pull the image.
   182  
   183  	// If the RuntimeClassInImageCriAPI feature gate is enabled, pass the runtime
   184  	// handler information for the specified runtime class. If no runtime class is
   185  	// specified, pass "".
   186  	podRuntimeHandler := ""
   187  	var err error
   188  	if utilfeature.DefaultFeatureGate.Enabled(features.RuntimeClassInImageCriAPI) {
   189  		if pod.Spec.RuntimeClassName != nil && *pod.Spec.RuntimeClassName != "" {
   190  			podRuntimeHandler, err = m.runtimeClassManager.LookupRuntimeHandler(pod.Spec.RuntimeClassName)
   191  			if err != nil {
   192  				msg := fmt.Sprintf("Failed to lookup runtimeHandler for runtimeClassName %v", pod.Spec.RuntimeClassName)
   193  				return msg, err
   194  			}
   195  		}
   196  	}
   197  
   198  	imageRef, msg, err := m.imagePuller.EnsureImageExists(ctx, pod, container, pullSecrets, podSandboxConfig, podRuntimeHandler)
   199  	if err != nil {
   200  		s, _ := grpcstatus.FromError(err)
   201  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   202  		return msg, err
   203  	}
   204  
   205  	// Step 2: create the container.
   206  	// For a new container, the RestartCount should be 0
   207  	restartCount := 0
   208  	containerStatus := podStatus.FindContainerStatusByName(container.Name)
   209  	if containerStatus != nil {
   210  		restartCount = containerStatus.RestartCount + 1
   211  	} else {
   212  		// The container runtime keeps state on container statuses and
   213  		// what the container restart count is. When nodes are rebooted
   214  		// some container runtimes clear their state which causes the
   215  		// restartCount to be reset to 0. This causes the logfile to
   216  		// start at 0.log, which either overwrites or appends to the
   217  		// already existing log.
   218  		//
   219  		// We are checking to see if the log directory exists, and find
   220  		// the latest restartCount by checking the log name -
   221  		// {restartCount}.log - and adding 1 to it.
   222  		logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
   223  		restartCount, err = calcRestartCountByLogDir(logDir)
   224  		if err != nil {
   225  			klog.InfoS("Cannot calculate restartCount from the log directory", "logDir", logDir, "err", err)
   226  			restartCount = 0
   227  		}
   228  	}
   229  
   230  	target, err := spec.getTargetID(podStatus)
   231  	if err != nil {
   232  		s, _ := grpcstatus.FromError(err)
   233  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   234  		return s.Message(), ErrCreateContainerConfig
   235  	}
   236  
   237  	containerConfig, cleanupAction, err := m.generateContainerConfig(ctx, container, pod, restartCount, podIP, imageRef, podIPs, target)
   238  	if cleanupAction != nil {
   239  		defer cleanupAction()
   240  	}
   241  	if err != nil {
   242  		s, _ := grpcstatus.FromError(err)
   243  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   244  		return s.Message(), ErrCreateContainerConfig
   245  	}
   246  
   247  	err = m.internalLifecycle.PreCreateContainer(pod, container, containerConfig)
   248  	if err != nil {
   249  		s, _ := grpcstatus.FromError(err)
   250  		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Internal PreCreateContainer hook failed: %v", s.Message())
   251  		return s.Message(), ErrPreCreateHook
   252  	}
   253  
   254  	containerID, err := m.runtimeService.CreateContainer(ctx, podSandboxID, containerConfig, podSandboxConfig)
   255  	if err != nil {
   256  		s, _ := grpcstatus.FromError(err)
   257  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
   258  		return s.Message(), ErrCreateContainer
   259  	}
   260  	err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
   261  	if err != nil {
   262  		s, _ := grpcstatus.FromError(err)
   263  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", s.Message())
   264  		return s.Message(), ErrPreStartHook
   265  	}
   266  	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, fmt.Sprintf("Created container %s", container.Name))
   267  
   268  	// Step 3: start the container.
   269  	err = m.runtimeService.StartContainer(ctx, containerID)
   270  	if err != nil {
   271  		s, _ := grpcstatus.FromError(err)
   272  		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
   273  		return s.Message(), kubecontainer.ErrRunContainer
   274  	}
   275  	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, fmt.Sprintf("Started container %s", container.Name))
   276  
   277  	// Symlink container logs to the legacy container log location for cluster logging
   278  	// support.
   279  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
   280  	containerMeta := containerConfig.GetMetadata()
   281  	sandboxMeta := podSandboxConfig.GetMetadata()
   282  	legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name,
   283  		sandboxMeta.Namespace)
   284  	containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
   285  	// Only create the legacy symlink if the containerLog path exists (or the error is not IsNotExist),
   286  	// because if the containerLog path does not exist, only a dangling legacySymlink would be created.
   287  	// That dangling legacySymlink is later removed by container GC, so it does not make sense
   288  	// to create it in the first place. This happens when the journald logging driver is used with docker.
   289  	if _, err := m.osInterface.Stat(containerLog); !os.IsNotExist(err) {
   290  		if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
   291  			klog.ErrorS(err, "Failed to create legacy symbolic link", "path", legacySymlink,
   292  				"containerID", containerID, "containerLogPath", containerLog)
   293  		}
   294  	}
   295  
   296  	// Step 4: execute the post start hook.
   297  	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
   298  		kubeContainerID := kubecontainer.ContainerID{
   299  			Type: m.runtimeName,
   300  			ID:   containerID,
   301  		}
   302  		msg, handlerErr := m.runner.Run(ctx, kubeContainerID, pod, container, container.Lifecycle.PostStart)
   303  		if handlerErr != nil {
   304  			klog.ErrorS(handlerErr, "Failed to execute PostStartHook", "pod", klog.KObj(pod),
   305  				"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
   306  			// do not record the message in the event so that secrets won't leak from the server.
   307  			m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, "PostStartHook failed")
   308  			if err := m.killContainer(ctx, pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil, nil); err != nil {
   309  				klog.ErrorS(err, "Failed to kill container", "pod", klog.KObj(pod),
   310  					"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
   311  			}
   312  			return msg, ErrPostStartHook
   313  		}
   314  	}
   315  
   316  	return "", nil
   317  }
   318  
   319  // generateContainerConfig generates container config for kubelet runtime v1.
   320  func (m *kubeGenericRuntimeManager) generateContainerConfig(ctx context.Context, container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
   321  	opts, cleanupAction, err := m.runtimeHelper.GenerateRunContainerOptions(ctx, pod, container, podIP, podIPs)
   322  	if err != nil {
   323  		return nil, nil, err
   324  	}
   325  
   326  	uid, username, err := m.getImageUser(ctx, container.Image)
   327  	if err != nil {
   328  		return nil, cleanupAction, err
   329  	}
   330  
   331  	// Verify RunAsNonRoot. Non-root verification only supports numeric user.
   332  	if err := verifyRunAsNonRoot(pod, container, uid, username); err != nil {
   333  		return nil, cleanupAction, err
   334  	}
   335  
   336  	command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)
   337  	logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
   338  	err = m.osInterface.MkdirAll(logDir, 0755)
   339  	if err != nil {
   340  		return nil, cleanupAction, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)
   341  	}
   342  	containerLogsPath := buildContainerLogsPath(container.Name, restartCount)
   343  	restartCountUint32 := uint32(restartCount)
   344  	config := &runtimeapi.ContainerConfig{
   345  		Metadata: &runtimeapi.ContainerMetadata{
   346  			Name:    container.Name,
   347  			Attempt: restartCountUint32,
   348  		},
   349  		Image:       &runtimeapi.ImageSpec{Image: imageRef, UserSpecifiedImage: container.Image},
   350  		Command:     command,
   351  		Args:        args,
   352  		WorkingDir:  container.WorkingDir,
   353  		Labels:      newContainerLabels(container, pod),
   354  		Annotations: newContainerAnnotations(container, pod, restartCount, opts),
   355  		Devices:     makeDevices(opts),
   356  		CDIDevices:  makeCDIDevices(opts),
   357  		Mounts:      m.makeMounts(opts, container),
   358  		LogPath:     containerLogsPath,
   359  		Stdin:       container.Stdin,
   360  		StdinOnce:   container.StdinOnce,
   361  		Tty:         container.TTY,
   362  	}
   363  
   364  	// set platform specific configurations.
   365  	if err := m.applyPlatformSpecificContainerConfig(config, container, pod, uid, username, nsTarget); err != nil {
   366  		return nil, cleanupAction, err
   367  	}
   368  
   369  	// set environment variables
   370  	envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
   371  	for idx := range opts.Envs {
   372  		e := opts.Envs[idx]
   373  		envs[idx] = &runtimeapi.KeyValue{
   374  			Key:   e.Name,
   375  			Value: e.Value,
   376  		}
   377  	}
   378  	config.Envs = envs
   379  
   380  	return config, cleanupAction, nil
   381  }
   382  
   383  func (m *kubeGenericRuntimeManager) updateContainerResources(pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error {
   384  	containerResources := m.generateContainerResources(pod, container)
   385  	if containerResources == nil {
   386  		return fmt.Errorf("container %q updateContainerResources failed: cannot generate resources config", containerID.String())
   387  	}
   388  	ctx := context.Background()
   389  	err := m.runtimeService.UpdateContainerResources(ctx, containerID.ID, containerResources)
   390  	if err != nil {
   391  		klog.ErrorS(err, "UpdateContainerResources failed", "container", containerID.String())
   392  	}
   393  	return err
   394  }
   395  
   396  // makeDevices generates container devices for kubelet runtime v1.
   397  func makeDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.Device {
   398  	devices := make([]*runtimeapi.Device, len(opts.Devices))
   399  
   400  	for idx := range opts.Devices {
   401  		device := opts.Devices[idx]
   402  		devices[idx] = &runtimeapi.Device{
   403  			HostPath:      device.PathOnHost,
   404  			ContainerPath: device.PathInContainer,
   405  			Permissions:   device.Permissions,
   406  		}
   407  	}
   408  
   409  	return devices
   410  }
   411  
   412  // makeCDIDevices generates container CDIDevices for kubelet runtime v1.
   413  func makeCDIDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.CDIDevice {
   414  	devices := make([]*runtimeapi.CDIDevice, len(opts.CDIDevices))
   415  
   416  	for i, device := range opts.CDIDevices {
   417  		devices[i] = &runtimeapi.CDIDevice{
   418  			Name: device.Name,
   419  		}
   420  	}
   421  
   422  	return devices
   423  }
   424  
   425  // makeMounts generates container volume mounts for kubelet runtime v1.
   426  func (m *kubeGenericRuntimeManager) makeMounts(opts *kubecontainer.RunContainerOptions, container *v1.Container) []*runtimeapi.Mount {
   427  	volumeMounts := []*runtimeapi.Mount{}
   428  
   429  	for idx := range opts.Mounts {
   430  		v := opts.Mounts[idx]
   431  		selinuxRelabel := v.SELinuxRelabel && selinux.GetEnabled()
   432  		mount := &runtimeapi.Mount{
   433  			HostPath:       v.HostPath,
   434  			ContainerPath:  v.ContainerPath,
   435  			Readonly:       v.ReadOnly,
   436  			SelinuxRelabel: selinuxRelabel,
   437  			Propagation:    v.Propagation,
   438  		}
   439  
   440  		volumeMounts = append(volumeMounts, mount)
   441  	}
   442  
   443  	// The reason we create and mount the termination-message file here (not in kubelet) is that
   444  	// the file's location depends on the ID of the container, and we need to create and
   445  	// mount the file before actually starting the container.
   446  	if opts.PodContainerDir != "" && len(container.TerminationMessagePath) != 0 {
   447  		// Because the PodContainerDir contains pod uid and container name which is unique enough,
   448  		// here we just add a random id to make the path unique for different instances
   449  		// of the same container.
   450  		cid := makeUID()
   451  		containerLogPath := filepath.Join(opts.PodContainerDir, cid)
   452  		fs, err := m.osInterface.Create(containerLogPath)
   453  		if err != nil {
   454  			utilruntime.HandleError(fmt.Errorf("error on creating termination-log file %q: %v", containerLogPath, err))
   455  		} else {
   456  			fs.Close()
   457  
   458  			// Chmod is needed because os.Create() ends up calling
   459  			// open(2) to create the file, so the final mode used is "mode &
   460  			// ~umask". But we want to make sure the specified mode is used
   461  			// in the file no matter what the umask is.
   462  			if err := m.osInterface.Chmod(containerLogPath, 0666); err != nil {
   463  				utilruntime.HandleError(fmt.Errorf("unable to set termination-log file permissions %q: %v", containerLogPath, err))
   464  			}
   465  
   466  			// Volume Mounts fail on Windows if it is not of the form C:/
   467  			containerLogPath = volumeutil.MakeAbsolutePath(goruntime.GOOS, containerLogPath)
   468  			terminationMessagePath := volumeutil.MakeAbsolutePath(goruntime.GOOS, container.TerminationMessagePath)
   469  			selinuxRelabel := selinux.GetEnabled()
   470  			volumeMounts = append(volumeMounts, &runtimeapi.Mount{
   471  				HostPath:       containerLogPath,
   472  				ContainerPath:  terminationMessagePath,
   473  				SelinuxRelabel: selinuxRelabel,
   474  			})
   475  		}
   476  	}
   477  
   478  	return volumeMounts
   479  }
   480  
   481  // getKubeletContainers lists containers managed by kubelet.
   482  // The boolean parameter specifies whether to return all containers, including
   483  // those that have already exited and dead containers (used for garbage collection).
   484  func (m *kubeGenericRuntimeManager) getKubeletContainers(ctx context.Context, allContainers bool) ([]*runtimeapi.Container, error) {
   485  	filter := &runtimeapi.ContainerFilter{}
   486  	if !allContainers {
   487  		filter.State = &runtimeapi.ContainerStateValue{
   488  			State: runtimeapi.ContainerState_CONTAINER_RUNNING,
   489  		}
   490  	}
   491  
   492  	containers, err := m.runtimeService.ListContainers(ctx, filter)
   493  	if err != nil {
   494  		klog.ErrorS(err, "ListContainers failed")
   495  		return nil, err
   496  	}
   497  
   498  	return containers, nil
   499  }
   500  
   501  // makeUID returns a randomly generated string.
   502  func makeUID() string {
   503  	return fmt.Sprintf("%08x", rand.Uint32())
   504  }
   505  
   506  // getTerminationMessage looks on the filesystem for the provided termination message path, returning a limited
   507  // number of bytes from it, or true as the second return value if the logs should be checked instead.
   508  func getTerminationMessage(status *runtimeapi.ContainerStatus, terminationMessagePath string, fallbackToLogs bool) (string, bool) {
   509  	if len(terminationMessagePath) == 0 {
   510  		return "", fallbackToLogs
   511  	}
   512  	// Volume Mounts fail on Windows if it is not of the form C:/
   513  	terminationMessagePath = volumeutil.MakeAbsolutePath(goruntime.GOOS, terminationMessagePath)
   514  	for _, mount := range status.Mounts {
   515  		if mount.ContainerPath != terminationMessagePath {
   516  			continue
   517  		}
   518  		path := mount.HostPath
   519  		data, _, err := tail.ReadAtMost(path, kubecontainer.MaxContainerTerminationMessageLength)
   520  		if err != nil {
   521  			if os.IsNotExist(err) {
   522  				return "", fallbackToLogs
   523  			}
   524  			return fmt.Sprintf("Error on reading termination log %s: %v", path, err), false
   525  		}
   526  		return string(data), (fallbackToLogs && len(data) == 0)
   527  	}
   528  	return "", fallbackToLogs
   529  }
   530  
   531  // readLastStringFromContainerLogs attempts to read up to the maximum termination message log length from the end
   532  // of the CRI log represented by path, reading at most the maximum number of termination message log lines.
   533  func (m *kubeGenericRuntimeManager) readLastStringFromContainerLogs(path string) string {
   534  	value := int64(kubecontainer.MaxContainerTerminationMessageLogLines)
   535  	buf, _ := circbuf.NewBuffer(kubecontainer.MaxContainerTerminationMessageLogLength)
   536  	if err := m.ReadLogs(context.Background(), path, "", &v1.PodLogOptions{TailLines: &value}, buf, buf); err != nil {
   537  		return fmt.Sprintf("Error on reading termination message from logs: %v", err)
   538  	}
   539  	return buf.String()
   540  }
   541  
   542  func (m *kubeGenericRuntimeManager) convertToKubeContainerStatus(status *runtimeapi.ContainerStatus) (cStatus *kubecontainer.Status) {
   543  	cStatus = toKubeContainerStatus(status, m.runtimeName)
   544  	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
   545  		// Populate the termination message if needed.
   546  		annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
   547  		// If a container cannot even be started, it certainly does not have logs, so no need to fallbackToLogs.
   548  		fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError &&
   549  			cStatus.ExitCode != 0 && cStatus.Reason != "ContainerCannotRun"
   550  		tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
   551  		if checkLogs {
   552  			tMessage = m.readLastStringFromContainerLogs(status.GetLogPath())
   553  		}
   554  		// Enrich the status message with the termination message written by the application, if it is not empty.
   555  		if len(tMessage) != 0 {
   556  			if len(cStatus.Message) != 0 {
   557  				cStatus.Message += ": "
   558  			}
   559  			cStatus.Message += tMessage
   560  		}
   561  	}
   562  	return cStatus
   563  }
   564  
   565  // getPodContainerStatuses gets all containers' statuses for the pod.
   566  func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context, uid kubetypes.UID, name, namespace string) ([]*kubecontainer.Status, error) {
   567  	// Select all containers of the given pod.
   568  	containers, err := m.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{
   569  		LabelSelector: map[string]string{kubelettypes.KubernetesPodUIDLabel: string(uid)},
   570  	})
   571  	if err != nil {
   572  		klog.ErrorS(err, "ListContainers error")
   573  		return nil, err
   574  	}
   575  
   576  	statuses := []*kubecontainer.Status{}
   577  	// TODO: optimization: set maximum number of containers per container name to examine.
   578  	for _, c := range containers {
   579  		resp, err := m.runtimeService.ContainerStatus(ctx, c.Id, false)
   580  		// Between List (ListContainers) and check (ContainerStatus) another thread might remove a container, and that is normal.
   581  		// The previous call (ListContainers) never fails due to a pod container not existing.
   582  		// Therefore, this method should not either, but instead act as if the previous call failed,
   583  		// which means the error should be ignored.
   584  		if crierror.IsNotFound(err) {
   585  			continue
   586  		}
   587  		if err != nil {
   588  			// Merely log this here; GetPodStatus will actually report the error out.
   589  			klog.V(4).InfoS("ContainerStatus return error", "containerID", c.Id, "err", err)
   590  			return nil, err
   591  		}
   592  		status := resp.GetStatus()
   593  		if status == nil {
   594  			return nil, remote.ErrContainerStatusNil
   595  		}
   596  		cStatus := m.convertToKubeContainerStatus(status)
   597  		statuses = append(statuses, cStatus)
   598  	}
   599  
   600  	sort.Sort(containerStatusByCreated(statuses))
   601  	return statuses, nil
   602  }
   603  
   604  func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName string) *kubecontainer.Status {
   605  	annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
   606  	labeledInfo := getContainerInfoFromLabels(status.Labels)
   607  	var cStatusResources *kubecontainer.ContainerResources
   608  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
   609  		// If runtime reports cpu & memory resources info, add it to container status
   610  		cStatusResources = toKubeContainerResources(status.Resources)
   611  	}
   612  	cStatus := &kubecontainer.Status{
   613  		ID: kubecontainer.ContainerID{
   614  			Type: runtimeName,
   615  			ID:   status.Id,
   616  		},
   617  		Name:                 labeledInfo.ContainerName,
   618  		Image:                status.Image.Image,
   619  		ImageID:              status.ImageRef,
   620  		ImageRuntimeHandler:  status.Image.RuntimeHandler,
   621  		Hash:                 annotatedInfo.Hash,
   622  		HashWithoutResources: annotatedInfo.HashWithoutResources,
   623  		RestartCount:         annotatedInfo.RestartCount,
   624  		State:                toKubeContainerState(status.State),
   625  		CreatedAt:            time.Unix(0, status.CreatedAt),
   626  		Resources:            cStatusResources,
   627  	}
   628  
   629  	if status.State != runtimeapi.ContainerState_CONTAINER_CREATED {
   630  		// If container is not in the created state, we have tried and
   631  		// started the container. Set the StartedAt time.
   632  		cStatus.StartedAt = time.Unix(0, status.StartedAt)
   633  	}
   634  	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
   635  		cStatus.Reason = status.Reason
   636  		cStatus.Message = status.Message
   637  		cStatus.ExitCode = int(status.ExitCode)
   638  		cStatus.FinishedAt = time.Unix(0, status.FinishedAt)
   639  	}
   640  	return cStatus
   641  }
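// Editor's note: CRI reports CreatedAt, StartedAt and FinishedAt as nanoseconds
// since the Unix epoch, which is why the conversions above use time.Unix(0, ns).
// A small illustrative example (the value is hypothetical):
//
//	ns := int64(1700000000123456789)
//	t := time.Unix(0, ns) // 2023-11-14 22:13:20.123456789 +0000 UTC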
   642  
   643  // executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
   644  func (m *kubeGenericRuntimeManager) executePreStopHook(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
   645  	klog.V(3).InfoS("Running preStop hook", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerSpec.Name, "containerID", containerID.String())
   646  
   647  	start := metav1.Now()
   648  	done := make(chan struct{})
   649  	go func() {
   650  		defer close(done)
   651  		defer utilruntime.HandleCrash()
   652  		if _, err := m.runner.Run(ctx, containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
   653  			klog.ErrorS(err, "PreStop hook failed", "pod", klog.KObj(pod), "podUID", pod.UID,
   654  				"containerName", containerSpec.Name, "containerID", containerID.String())
   655  			// do not record the message in the event so that secrets won't leak from the server.
   656  			m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, "PreStopHook failed")
   657  		}
   658  	}()
   659  
   660  	select {
   661  	case <-time.After(time.Duration(gracePeriod) * time.Second):
   662  		klog.V(2).InfoS("PreStop hook not completed in grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
   663  			"containerName", containerSpec.Name, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   664  	case <-done:
   665  		klog.V(3).InfoS("PreStop hook completed", "pod", klog.KObj(pod), "podUID", pod.UID,
   666  			"containerName", containerSpec.Name, "containerID", containerID.String())
   667  	}
   668  
   669  	return int64(metav1.Now().Sub(start.Time).Seconds())
   670  }
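// Editor's note: a reduced sketch (not from the original file) of the bounded-wait
// pattern used above. The hook runs in its own goroutine; the caller waits for
// whichever comes first, completion or the grace-period timer, and the elapsed
// time is what killContainer later subtracts from the remaining grace period.
// runHook is a hypothetical stand-in for m.runner.Run.
//
//	gracePeriod := int64(30)
//	done := make(chan struct{})
//	go func() {
//		defer close(done)
//		runHook()
//	}()
//	select {
//	case <-time.After(time.Duration(gracePeriod) * time.Second):
//		// grace period expired; stop waiting (the hook may still be running)
//	case <-done:
//		// hook finished in time
//	}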
   671  
   672  // restoreSpecsFromContainerLabels restores all information needed for killing a container. In some
   673  // cases we may not have the pod and container spec when killing a container, e.g. the pod was deleted
   674  // during a kubelet restart.
   675  // To solve this problem, we've already written necessary information into container labels. Here we
   676  // just need to retrieve them from container labels and restore the specs.
   677  // TODO(random-liu): Add a node e2e test to test this behaviour.
   678  // TODO(random-liu): Change the lifecycle handler to just accept information needed, so that we can
   679  // just pass the needed function not create the fake object.
   680  func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(ctx context.Context, containerID kubecontainer.ContainerID) (*v1.Pod, *v1.Container, error) {
   681  	var pod *v1.Pod
   682  	var container *v1.Container
   683  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
   684  	if err != nil {
   685  		return nil, nil, err
   686  	}
   687  	s := resp.GetStatus()
   688  	if s == nil {
   689  		return nil, nil, remote.ErrContainerStatusNil
   690  	}
   691  
   692  	l := getContainerInfoFromLabels(s.Labels)
   693  	a := getContainerInfoFromAnnotations(s.Annotations)
   694  	// Notice that the following objects are not full specs. The container killing code should not use
   695  	// un-restored fields.
   696  	pod = &v1.Pod{
   697  		ObjectMeta: metav1.ObjectMeta{
   698  			UID:                        l.PodUID,
   699  			Name:                       l.PodName,
   700  			Namespace:                  l.PodNamespace,
   701  			DeletionGracePeriodSeconds: a.PodDeletionGracePeriod,
   702  		},
   703  		Spec: v1.PodSpec{
   704  			TerminationGracePeriodSeconds: a.PodTerminationGracePeriod,
   705  		},
   706  	}
   707  	container = &v1.Container{
   708  		Name:                   l.ContainerName,
   709  		Ports:                  a.ContainerPorts,
   710  		TerminationMessagePath: a.TerminationMessagePath,
   711  	}
   712  	if a.PreStopHandler != nil {
   713  		container.Lifecycle = &v1.Lifecycle{
   714  			PreStop: a.PreStopHandler,
   715  		}
   716  	}
   717  	return pod, container, nil
   718  }
   719  
   720  // killContainer kills a container through the following steps:
   721  // * Run the pre-stop lifecycle hooks (if applicable).
   722  // * Stop the container.
   723  func (m *kubeGenericRuntimeManager) killContainer(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64, ordering *terminationOrdering) error {
   724  	var containerSpec *v1.Container
   725  	if pod != nil {
   726  		if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
   727  			return fmt.Errorf("failed to get containerSpec %q (id=%q) in pod %q when killing container for reason %q",
   728  				containerName, containerID.String(), format.Pod(pod), message)
   729  		}
   730  	} else {
   731  		// Restore necessary information if one of the specs is nil.
   732  		restoredPod, restoredContainer, err := m.restoreSpecsFromContainerLabels(ctx, containerID)
   733  		if err != nil {
   734  			return err
   735  		}
   736  		pod, containerSpec = restoredPod, restoredContainer
   737  	}
   738  
   739  	// From this point, pod and container must be non-nil.
   740  	gracePeriod := setTerminationGracePeriod(pod, containerSpec, containerName, containerID, reason)
   741  
   742  	if len(message) == 0 {
   743  		message = fmt.Sprintf("Stopping container %s", containerSpec.Name)
   744  	}
   745  	m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
   746  
   747  	if gracePeriodOverride != nil {
   748  		gracePeriod = *gracePeriodOverride
   749  		klog.V(3).InfoS("Killing container with a grace period override", "pod", klog.KObj(pod), "podUID", pod.UID,
   750  			"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   751  	}
   752  
   753  	// Run the pre-stop lifecycle hook if applicable and if there is enough time to run it
   754  	if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
   755  		gracePeriod = gracePeriod - m.executePreStopHook(ctx, pod, containerID, containerSpec, gracePeriod)
   756  	}
   757  
   758  	// if we care about termination ordering, then wait for this container's turn to exit if there is
   759  	// time remaining
   760  	if ordering != nil && gracePeriod > 0 {
   761  		// grace period is only in seconds, so the time we've waited gets truncated downward
   762  		gracePeriod -= int64(ordering.waitForTurn(containerName, gracePeriod))
   763  	}
   764  
   765  	// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
   766  	if gracePeriod < minimumGracePeriodInSeconds {
   767  		gracePeriod = minimumGracePeriodInSeconds
   768  	}
   769  
   770  	klog.V(2).InfoS("Killing container with a grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
   771  		"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   772  
   773  	err := m.runtimeService.StopContainer(ctx, containerID.ID, gracePeriod)
   774  	if err != nil && !crierror.IsNotFound(err) {
   775  		klog.ErrorS(err, "Container termination failed with gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID,
   776  			"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
   777  		return err
   778  	}
   779  	klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID,
   780  		"containerName", containerName, "containerID", containerID.String())
   781  
   782  	if ordering != nil {
   783  		ordering.containerTerminated(containerName)
   784  	}
   785  
   786  	return nil
   787  }
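// Editor's note: a worked example (numbers hypothetical) of the grace-period
// accounting in killContainer above. With gracePeriod = 30s: a preStop hook that
// takes 12s leaves 18s; waiting 17s for termination ordering leaves 1s; since that
// is below the minimal shutdown window, it is raised to minimumGracePeriodInSeconds
// before StopContainer is called with the result.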
   788  
   789  // killContainersWithSyncResult kills all pod's containers with sync results.
   790  func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) {
   791  	containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
   792  	wg := sync.WaitGroup{}
   793  
   794  	wg.Add(len(runningPod.Containers))
   795  	var termOrdering *terminationOrdering
   796  	// we only care about container termination ordering if the sidecars feature is enabled
   797  	if utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) {
   798  		var runningContainerNames []string
   799  		for _, container := range runningPod.Containers {
   800  			runningContainerNames = append(runningContainerNames, container.Name)
   801  		}
   802  		termOrdering = newTerminationOrdering(pod, runningContainerNames)
   803  	}
   804  	for _, container := range runningPod.Containers {
   805  		go func(container *kubecontainer.Container) {
   806  			defer utilruntime.HandleCrash()
   807  			defer wg.Done()
   808  
   809  			killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
   810  			if err := m.killContainer(ctx, pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride, termOrdering); err != nil {
   811  				killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
   812  				// Use runningPod for logging as the pod passed in could be *nil*.
   813  				klog.ErrorS(err, "Kill container failed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID,
   814  					"containerName", container.Name, "containerID", container.ID)
   815  			}
   816  			containerResults <- killContainerResult
   817  		}(container)
   818  	}
   819  	wg.Wait()
   820  	close(containerResults)
   821  
   822  	for containerResult := range containerResults {
   823  		syncResults = append(syncResults, containerResult)
   824  	}
   825  	return
   826  }
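// Editor's note: a standalone sketch (not from the original file) of the
// fan-out/fan-in shape used above: one goroutine per item, results funnelled
// through a buffered channel that is closed once the WaitGroup drains, so the
// final range terminates. items and kill are hypothetical stand-ins.
//
//	items := []string{"a", "b", "c"}
//	var collected []string
//	results := make(chan string, len(items))
//	var wg sync.WaitGroup
//	wg.Add(len(items))
//	for _, item := range items {
//		go func(item string) {
//			defer wg.Done()
//			results <- kill(item)
//		}(item)
//	}
//	wg.Wait()
//	close(results)
//	for r := range results {
//		collected = append(collected, r)
//	}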
   827  
   828  // pruneInitContainersBeforeStart ensures that before we begin creating init
   829  // containers, we have reduced the number of outstanding init containers still
   830  // present. This reduces load on the container garbage collector by only
   831  // preserving the most recent terminated init container.
   832  func (m *kubeGenericRuntimeManager) pruneInitContainersBeforeStart(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
   833  	// only the last execution of each init container should be preserved, and only preserve it if it is in the
   834  	// list of init containers to keep.
   835  	initContainerNames := sets.NewString()
   836  	for _, container := range pod.Spec.InitContainers {
   837  		initContainerNames.Insert(container.Name)
   838  	}
   839  	for name := range initContainerNames {
   840  		count := 0
   841  		for _, status := range podStatus.ContainerStatuses {
   842  			if status.Name != name ||
   843  				(status.State != kubecontainer.ContainerStateExited &&
   844  					status.State != kubecontainer.ContainerStateUnknown) {
   845  				continue
   846  			}
   847  			// Remove init containers in an unknown state. They should have
   848  			// been stopped before pruneInitContainersBeforeStart is
   849  			// called.
   850  			count++
   851  			// keep the first init container for this name
   852  			if count == 1 {
   853  				continue
   854  			}
   855  			// prune all other init containers that match this container name
   856  			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
   857  			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
   858  				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
   859  				continue
   860  			}
   861  		}
   862  	}
   863  }
   864  
   865  // Remove all init containers. Note that this function does not check the state
   866  // of the container because it assumes all init containers have been stopped
   867  // before the call happens.
   868  func (m *kubeGenericRuntimeManager) purgeInitContainers(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
   869  	initContainerNames := sets.NewString()
   870  	for _, container := range pod.Spec.InitContainers {
   871  		initContainerNames.Insert(container.Name)
   872  	}
   873  	for name := range initContainerNames {
   874  		count := 0
   875  		for _, status := range podStatus.ContainerStatuses {
   876  			if status.Name != name {
   877  				continue
   878  			}
   879  			count++
   880  			// Purge all init containers that match this container name
   881  			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
   882  			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
   883  				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
   884  				continue
   885  			}
   886  		}
   887  	}
   888  }
   889  
   890  // findNextInitContainerToRun returns the status of the last failed container, the
   891  // next init container to start, or done=true if there are no further init containers.
   892  // Status is only returned if an init container has failed, in which case next will
   893  // point to the current (failed) container.
   894  func findNextInitContainerToRun(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (status *kubecontainer.Status, next *v1.Container, done bool) {
   895  	if len(pod.Spec.InitContainers) == 0 {
   896  		return nil, nil, true
   897  	}
   898  
   899  	// If any of the main containers have status and are Running, then all init containers must
   900  	// have been executed at some point in the past.  However, they could have been removed
   901  	// from the container runtime now, and if we proceed, it would appear as if they
   902  	// never ran and will re-execute improperly.
   903  	for i := range pod.Spec.Containers {
   904  		container := &pod.Spec.Containers[i]
   905  		status := podStatus.FindContainerStatusByName(container.Name)
   906  		if status != nil && status.State == kubecontainer.ContainerStateRunning {
   907  			return nil, nil, true
   908  		}
   909  	}
   910  
   911  	// If there are failed containers, return the status of the last failed one.
   912  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
   913  		container := &pod.Spec.InitContainers[i]
   914  		status := podStatus.FindContainerStatusByName(container.Name)
   915  		if status != nil && isInitContainerFailed(status) {
   916  			return status, container, false
   917  		}
   918  	}
   919  
   920  	// There are no failed containers now.
   921  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
   922  		container := &pod.Spec.InitContainers[i]
   923  		status := podStatus.FindContainerStatusByName(container.Name)
   924  		if status == nil {
   925  			continue
   926  		}
   927  
   928  		// container is still running, return not done.
   929  		if status.State == kubecontainer.ContainerStateRunning {
   930  			return nil, nil, false
   931  		}
   932  
   933  		if status.State == kubecontainer.ContainerStateExited {
   934  			// all init containers successful
   935  			if i == (len(pod.Spec.InitContainers) - 1) {
   936  				return nil, nil, true
   937  			}
   938  
   939  			// all containers up to i successful, go to i+1
   940  			return nil, &pod.Spec.InitContainers[i+1], false
   941  		}
   942  	}
   943  
   944  	return nil, &pod.Spec.InitContainers[0], false
   945  }
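// Editor's note: a worked example (not part of the original file) of the return
// values above, for init containers declared in the order [a, b, c] and no
// running main containers:
//
//	c exited successfully                  -> (nil, nil, true)      // all init containers done
//	b exited successfully, c has no status -> (nil, &c, false)      // start c next
//	b failed                               -> (statusB, &b, false)  // retry b
//	no init container has any status       -> (nil, &a, false)      // start the first one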
   946  
   947  // hasAnyRegularContainerCreated returns true if any regular container has been
   948  // created, which indicates all init containers have been initialized.
   949  func hasAnyRegularContainerCreated(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool {
   950  	for _, container := range pod.Spec.Containers {
   951  		status := podStatus.FindContainerStatusByName(container.Name)
   952  		if status == nil {
   953  			continue
   954  		}
   955  		switch status.State {
   956  		case kubecontainer.ContainerStateCreated,
   957  			kubecontainer.ContainerStateRunning,
   958  			kubecontainer.ContainerStateExited:
   959  			return true
   960  		default:
   961  			// Ignore other states
   962  		}
   963  	}
   964  	return false
   965  }
   966  
   967  // computeInitContainerActions sets the actions on the given changes that need
   968  // to be taken for the init containers. This includes actions to initialize the
   969  // init containers and actions to keep restartable init containers running.
   970  // computeInitContainerActions returns true if pod has been initialized.
   971  //
   972  // The actions include:
   973  // - Start the first init container that has not been started.
   974  // - Restart all restartable init containers that have started but are not running.
   975  // - Kill the restartable init containers that are not alive or started.
   976  //
   977  // Note that this is a function for the SidecarContainers feature.
   978  // Please sync with the findNextInitContainerToRun function if any changes are
   979  // made, as either this or that function will be called.
   980  func (m *kubeGenericRuntimeManager) computeInitContainerActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus, changes *podActions) bool {
   981  	if len(pod.Spec.InitContainers) == 0 {
   982  		return true
   983  	}
   984  
   985  	// If any of the main containers have status and are Running, then all init containers must
   986  	// have been executed at some point in the past.  However, they could have been removed
   987  	// from the container runtime now, and if we proceed, it would appear as if they
   988  	// never ran and will re-execute improperly except for the restartable init containers.
   989  	podHasInitialized := false
   990  	for _, container := range pod.Spec.Containers {
   991  		status := podStatus.FindContainerStatusByName(container.Name)
   992  		if status == nil {
   993  			continue
   994  		}
   995  		switch status.State {
   996  		case kubecontainer.ContainerStateCreated,
   997  			kubecontainer.ContainerStateRunning:
   998  			podHasInitialized = true
   999  		case kubecontainer.ContainerStateExited:
  1000  			// This is a workaround for the issue that the kubelet cannot
  1001  			// differentiate the container statuses of the previous podSandbox
  1002  			// from the current one.
  1003  			// If the node is rebooted, all containers will be in the exited
  1004  			// state and the kubelet will try to recreate a new podSandbox.
  1005  			// In this case, the kubelet should not mistakenly think that
  1006  			// the newly created podSandbox has been initialized.
  1007  		default:
  1008  			// Ignore other states
  1009  		}
  1010  		if podHasInitialized {
  1011  			break
  1012  		}
  1013  	}
  1014  
  1015  	// isPreviouslyInitialized indicates if the current init container is
  1016  	// previously initialized.
  1017  	isPreviouslyInitialized := podHasInitialized
  1018  	restartOnFailure := shouldRestartOnFailure(pod)
  1019  
  1020  	// Note that we iterate through the init containers in reverse order to find
  1021  	// the next init container to run, as the completed init containers may get
  1022  	// removed from container runtime for various reasons. Therefore the kubelet
  1023  	// should rely on the minimal number of init containers - the last one.
  1024  	//
  1025  	// Once we find the next init container to run, iterate through the rest to
  1026  	// find the restartable init containers to restart.
  1027  	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
  1028  		container := &pod.Spec.InitContainers[i]
  1029  		status := podStatus.FindContainerStatusByName(container.Name)
  1030  		klog.V(4).InfoS("Computing init container action", "pod", klog.KObj(pod), "container", container.Name, "status", status)
  1031  		if status == nil {
  1032  			// If the container was previously initialized but its status is not
  1033  			// found, its last status was removed for some reason.
  1034  			// Restart it if it is a restartable init container.
  1035  			if isPreviouslyInitialized && types.IsRestartableInitContainer(container) {
  1036  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1037  			}
  1038  			continue
  1039  		}
  1040  
  1041  		if isPreviouslyInitialized && !types.IsRestartableInitContainer(container) {
  1042  			// after initialization, only restartable init containers need to be kept
  1043  			// running
  1044  			continue
  1045  		}
  1046  
  1047  		switch status.State {
  1048  		case kubecontainer.ContainerStateCreated:
  1049  			// nothing to do but wait for it to start
  1050  
  1051  		case kubecontainer.ContainerStateRunning:
  1052  			if !types.IsRestartableInitContainer(container) {
  1053  				break
  1054  			}
  1055  
  1056  			if types.IsRestartableInitContainer(container) {
  1057  				if container.StartupProbe != nil {
  1058  					startup, found := m.startupManager.Get(status.ID)
  1059  					if !found {
  1060  						// If the startup probe has not been run, wait for it.
  1061  						break
  1062  					}
  1063  					if startup != proberesults.Success {
  1064  						if startup == proberesults.Failure {
  1065  							// If the restartable init container failed the startup probe,
  1066  							// restart it.
  1067  							changes.ContainersToKill[status.ID] = containerToKillInfo{
  1068  								name:      container.Name,
  1069  								container: container,
  1070  								message:   fmt.Sprintf("Init container %s failed startup probe", container.Name),
  1071  								reason:    reasonStartupProbe,
  1072  							}
  1073  							changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1074  						}
  1075  						break
  1076  					}
  1077  				}
  1078  
  1079  				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
  1080  				if i == (len(pod.Spec.InitContainers) - 1) {
  1081  					podHasInitialized = true
  1082  				} else if !isPreviouslyInitialized {
  1083  					// this init container is initialized for the first time, start the next one
  1084  					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
  1085  				}
  1086  
  1087  				// A restartable init container does not have to take into account its
  1088  				// liveness probe when it determines to start the next init container.
  1089  				if container.LivenessProbe != nil {
  1090  					liveness, found := m.livenessManager.Get(status.ID)
  1091  					if !found {
  1092  						// If the liveness probe has not been run, wait for it.
  1093  						break
  1094  					}
  1095  					if liveness == proberesults.Failure {
  1096  						// If the restartable init container failed the liveness probe,
  1097  						// restart it.
  1098  						changes.ContainersToKill[status.ID] = containerToKillInfo{
  1099  							name:      container.Name,
  1100  							container: container,
  1101  							message:   fmt.Sprintf("Init container %s failed liveness probe", container.Name),
  1102  							reason:    reasonLivenessProbe,
  1103  						}
  1104  						changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1105  					}
  1106  				}
  1107  			} else { // init container
  1108  				// nothing to do but wait for it to finish
  1109  				break
  1110  			}
  1111  
  1112  		// If the init container failed and the restart policy is Never, the pod is terminal.
  1113  		// Otherwise, restart the init container.
  1114  		case kubecontainer.ContainerStateExited:
  1115  			if types.IsRestartableInitContainer(container) {
  1116  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1117  			} else { // init container
  1118  				if isInitContainerFailed(status) {
  1119  					if !restartOnFailure {
  1120  						changes.KillPod = true
  1121  						changes.InitContainersToStart = nil
  1122  						return false
  1123  					}
  1124  					changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1125  					break
  1126  				}
  1127  
  1128  				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
  1129  				if i == (len(pod.Spec.InitContainers) - 1) {
  1130  					podHasInitialized = true
  1131  				} else {
  1132  					// this init container is initialized for the first time, start the next one
  1133  					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
  1134  				}
  1135  			}
  1136  
  1137  		default: // kubecontainer.ContainerStateUnknown or other unknown states
  1138  			if types.IsRestartableInitContainer(container) {
  1139  				// If the restartable init container is in unknown state, restart it.
  1140  				changes.ContainersToKill[status.ID] = containerToKillInfo{
  1141  					name:      container.Name,
  1142  					container: container,
  1143  					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  1144  						status.State),
  1145  					reason: reasonUnknown,
  1146  				}
  1147  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1148  			} else { // init container
  1149  				if !isInitContainerFailed(status) {
  1150  					klog.V(4).InfoS("This should not happen, init container is in unknown state but not failed", "pod", klog.KObj(pod), "containerStatus", status)
  1151  				}
  1152  
  1153  				if !restartOnFailure {
  1154  					changes.KillPod = true
  1155  					changes.InitContainersToStart = nil
  1156  					return false
  1157  				}
  1158  
  1159  				// If the init container is in unknown state, restart it.
  1160  				changes.ContainersToKill[status.ID] = containerToKillInfo{
  1161  					name:      container.Name,
  1162  					container: container,
  1163  					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
  1164  						status.State),
  1165  					reason: reasonUnknown,
  1166  				}
  1167  				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
  1168  			}
  1169  		}
  1170  
  1171  		if !isPreviouslyInitialized {
  1172  			// this init container has been started, so the ones before it were previously initialized
  1173  			isPreviouslyInitialized = true
  1174  		}
  1175  	}
  1176  
  1177  	// this means no init containers have been started,
  1178  	// start the first one
  1179  	if !isPreviouslyInitialized {
  1180  		changes.InitContainersToStart = append(changes.InitContainersToStart, 0)
  1181  	}
  1182  
  1183  	// reverse the InitContainersToStart, as the above loop iterated through the
  1184  	// init containers backwards, but we want to start them as per the order in
  1185  	// the pod spec.
  1186  	l := len(changes.InitContainersToStart)
  1187  	for i := 0; i < l/2; i++ {
  1188  		changes.InitContainersToStart[i], changes.InitContainersToStart[l-1-i] =
  1189  			changes.InitContainersToStart[l-1-i], changes.InitContainersToStart[i]
  1190  	}
  1191  
  1192  	return podHasInitialized
  1193  }
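
// The function above walks the init containers from last to first, so the
// indices appended to InitContainersToStart come out in reverse spec order;
// the swap loop at the end restores the order declared in the pod spec. The
// following is a minimal, hypothetical sketch of that same in-place reversal
// (reverseInts is not part of the kubelet):
func reverseInts(s []int) {
	// Swap the i-th element with its mirror until the two indices meet.
	for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
		s[i], s[j] = s[j], s[i]
	}
}

// For example, indices collected as [3, 2, 0] become [0, 2, 3] after the
// reversal, matching the order the init containers appear in the pod spec.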
  1194  
  1195  // GetContainerLogs returns logs of a specific container.
  1196  func (m *kubeGenericRuntimeManager) GetContainerLogs(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) (err error) {
  1197  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
  1198  	if err != nil {
  1199  		klog.V(4).InfoS("Failed to get container status", "containerID", containerID.String(), "err", err)
  1200  		return fmt.Errorf("unable to retrieve container logs for %v", containerID.String())
  1201  	}
  1202  	status := resp.GetStatus()
  1203  	if status == nil {
  1204  		return remote.ErrContainerStatusNil
  1205  	}
  1206  	return m.ReadLogs(ctx, status.GetLogPath(), containerID.ID, logOptions, stdout, stderr)
  1207  }
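
// GetContainerLogs resolves the CRI log path from the container status and
// then delegates the actual reading to ReadLogs. A hypothetical caller sketch
// (exampleDumpContainerLogs is not part of the kubelet) that streams the last
// ten lines of a container's log to this process's stdout and stderr:
func exampleDumpContainerLogs(ctx context.Context, m *kubeGenericRuntimeManager, pod *v1.Pod, id kubecontainer.ContainerID) error {
	tailLines := int64(10)
	opts := &v1.PodLogOptions{TailLines: &tailLines}
	return m.GetContainerLogs(ctx, pod, id, opts, os.Stdout, os.Stderr)
}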
  1208  
  1209  // GetExec gets the endpoint the runtime will serve the exec request from.
  1210  func (m *kubeGenericRuntimeManager) GetExec(ctx context.Context, id kubecontainer.ContainerID, cmd []string, stdin, stdout, stderr, tty bool) (*url.URL, error) {
  1211  	req := &runtimeapi.ExecRequest{
  1212  		ContainerId: id.ID,
  1213  		Cmd:         cmd,
  1214  		Tty:         tty,
  1215  		Stdin:       stdin,
  1216  		Stdout:      stdout,
  1217  		Stderr:      stderr,
  1218  	}
  1219  	resp, err := m.runtimeService.Exec(ctx, req)
  1220  	if err != nil {
  1221  		return nil, err
  1222  	}
  1223  
  1224  	return url.Parse(resp.Url)
  1225  }
  1226  
  1227  // GetAttach gets the endpoint the runtime will serve the attach request from.
  1228  func (m *kubeGenericRuntimeManager) GetAttach(ctx context.Context, id kubecontainer.ContainerID, stdin, stdout, stderr, tty bool) (*url.URL, error) {
  1229  	req := &runtimeapi.AttachRequest{
  1230  		ContainerId: id.ID,
  1231  		Stdin:       stdin,
  1232  		Stdout:      stdout,
  1233  		Stderr:      stderr,
  1234  		Tty:         tty,
  1235  	}
  1236  	resp, err := m.runtimeService.Attach(ctx, req)
  1237  	if err != nil {
  1238  		return nil, err
  1239  	}
  1240  	return url.Parse(resp.Url)
  1241  }
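
// Both GetExec and GetAttach only ask the CRI runtime for the URL of a
// streaming endpoint; the exec or attach session itself is established later
// by whichever client dials that URL. A hypothetical sketch
// (exampleGetShellURL is not part of the kubelet) that requests an
// interactive shell endpoint:
func exampleGetShellURL(ctx context.Context, m *kubeGenericRuntimeManager, id kubecontainer.ContainerID) (*url.URL, error) {
	// stdin and tty are enabled for an interactive session; with a TTY,
	// stderr is multiplexed onto stdout, so it is left disabled here.
	return m.GetExec(ctx, id, []string{"/bin/sh"},
		true,  // stdin
		true,  // stdout
		false, // stderr
		true,  // tty
	)
}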
  1242  
  1243  // RunInContainer synchronously executes the command in the container, and returns the output.
  1244  func (m *kubeGenericRuntimeManager) RunInContainer(ctx context.Context, id kubecontainer.ContainerID, cmd []string, timeout time.Duration) ([]byte, error) {
  1245  	stdout, stderr, err := m.runtimeService.ExecSync(ctx, id.ID, cmd, timeout)
  1246  	// NOTE(tallclair): This does not correctly interleave stdout & stderr, but should be sufficient
  1247  	// for logging purposes. A combined output option will need to be added to the ExecSyncRequest
  1248  	// if more precise output ordering is ever required.
  1249  	return append(stdout, stderr...), err
  1250  }
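
// Because ExecSync returns stdout and stderr separately and the two byte
// slices are simply concatenated, the combined output loses interleaving, as
// the NOTE above explains. A hypothetical sketch (exampleRunAndLog is not
// part of the kubelet) that runs a short command with a bounded timeout and
// logs the combined output:
func exampleRunAndLog(ctx context.Context, m *kubeGenericRuntimeManager, id kubecontainer.ContainerID) {
	out, err := m.RunInContainer(ctx, id, []string{"cat", "/etc/os-release"}, 10*time.Second)
	if err != nil {
		klog.ErrorS(err, "Command failed", "containerID", id.String())
		return
	}
	klog.InfoS("Command output", "containerID", id.String(), "output", string(out))
}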
  1251  
  1252  // removeContainer removes the container and its logs.
  1253  // Note that we remove the container logs first, so that the container will not be removed if
  1254  // removing the container logs fails, and the kubelet will retry this later. This guarantees
  1255  // that the container logs are removed together with the container.
  1256  // Note that we assume the container is only removed in a non-running state, in which
  1257  // it no longer writes container logs.
  1258  func (m *kubeGenericRuntimeManager) removeContainer(ctx context.Context, containerID string) error {
  1259  	klog.V(4).InfoS("Removing container", "containerID", containerID)
  1260  	// Call internal container post-stop lifecycle hook.
  1261  	if err := m.internalLifecycle.PostStopContainer(containerID); err != nil {
  1262  		return err
  1263  	}
  1264  
  1265  	// Remove the container log.
  1266  	// TODO: Separate log and container lifecycle management.
  1267  	if err := m.removeContainerLog(ctx, containerID); err != nil {
  1268  		return err
  1269  	}
  1270  	// Remove the container.
  1271  	return m.runtimeService.RemoveContainer(ctx, containerID)
  1272  }
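
// Because removeContainer deletes the logs before the container itself, a
// failure part-way through leaves the container in place and the whole call
// can simply be retried later. A hypothetical sketch (exampleRemoveWithRetry
// is not part of the kubelet) of such a retry loop:
func exampleRemoveWithRetry(ctx context.Context, m *kubeGenericRuntimeManager, containerID string, attempts int) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = m.removeContainer(ctx, containerID); err == nil {
			return nil
		}
		klog.V(4).InfoS("Container removal failed, will retry", "containerID", containerID, "attempt", i+1, "err", err)
	}
	return err
}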
  1273  
  1274  // removeContainerLog removes the container log.
  1275  func (m *kubeGenericRuntimeManager) removeContainerLog(ctx context.Context, containerID string) error {
  1276  	// Use log manager to remove rotated logs.
  1277  	err := m.logManager.Clean(ctx, containerID)
  1278  	if err != nil {
  1279  		return err
  1280  	}
  1281  
  1282  	resp, err := m.runtimeService.ContainerStatus(ctx, containerID, false)
  1283  	if err != nil {
  1284  		return fmt.Errorf("failed to get container status %q: %v", containerID, err)
  1285  	}
  1286  	status := resp.GetStatus()
  1287  	if status == nil {
  1288  		return remote.ErrContainerStatusNil
  1289  	}
  1290  	// Remove the legacy container log symlink.
  1291  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
  1292  	labeledInfo := getContainerInfoFromLabels(status.Labels)
  1293  	legacySymlink := legacyLogSymlink(containerID, labeledInfo.ContainerName, labeledInfo.PodName,
  1294  		labeledInfo.PodNamespace)
  1295  	if err := m.osInterface.Remove(legacySymlink); err != nil && !os.IsNotExist(err) {
  1296  		return fmt.Errorf("failed to remove container %q log legacy symbolic link %q: %v",
  1297  			containerID, legacySymlink, err)
  1298  	}
  1299  	return nil
  1300  }
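
// The legacy symlink is removed with the common "ignore a missing file"
// idiom: os.IsNotExist errors are tolerated, everything else is reported. A
// hypothetical helper (exampleRemoveIfExists is not part of the kubelet)
// showing the same idiom in isolation:
func exampleRemoveIfExists(path string) error {
	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to remove %q: %v", path, err)
	}
	return nil
}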
  1301  
  1302  // DeleteContainer removes a container.
  1303  func (m *kubeGenericRuntimeManager) DeleteContainer(ctx context.Context, containerID kubecontainer.ContainerID) error {
  1304  	return m.removeContainer(ctx, containerID.ID)
  1305  }
  1306  
  1307  // setTerminationGracePeriod determines the grace period to use when killing a container
  1308  func setTerminationGracePeriod(pod *v1.Pod, containerSpec *v1.Container, containerName string, containerID kubecontainer.ContainerID, reason containerKillReason) int64 {
  1309  	gracePeriod := int64(minimumGracePeriodInSeconds)
  1310  	switch {
  1311  	case pod.DeletionGracePeriodSeconds != nil:
  1312  		return *pod.DeletionGracePeriodSeconds
  1313  	case pod.Spec.TerminationGracePeriodSeconds != nil:
  1314  		switch reason {
  1315  		case reasonStartupProbe:
  1316  			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.StartupProbe, containerName, containerID, "StartupProbe") {
  1317  				return *containerSpec.StartupProbe.TerminationGracePeriodSeconds
  1318  			}
  1319  		case reasonLivenessProbe:
  1320  			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.LivenessProbe, containerName, containerID, "LivenessProbe") {
  1321  				return *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
  1322  			}
  1323  		}
  1324  		return *pod.Spec.TerminationGracePeriodSeconds
  1325  	}
  1326  	return gracePeriod
  1327  }
  1328  
  1329  func isProbeTerminationGracePeriodSecondsSet(pod *v1.Pod, containerSpec *v1.Container, probe *v1.Probe, containerName string, containerID kubecontainer.ContainerID, probeType string) bool {
  1330  	if probe != nil && probe.TerminationGracePeriodSeconds != nil {
  1331  		if *probe.TerminationGracePeriodSeconds > *pod.Spec.TerminationGracePeriodSeconds {
  1332  			klog.V(4).InfoS("Using probe-level grace period that is greater than the pod-level grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerName, "containerID", containerID.String(), "probeType", probeType, "probeGracePeriod", *probe.TerminationGracePeriodSeconds, "podGracePeriod", *pod.Spec.TerminationGracePeriodSeconds)
  1333  		}
  1334  		return true
  1335  	}
  1336  	return false
  1337  }
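
// Taken together, setTerminationGracePeriod and
// isProbeTerminationGracePeriodSecondsSet implement the precedence: the pod's
// DeletionGracePeriodSeconds wins outright; otherwise a probe-level
// terminationGracePeriodSeconds is used when the kill was triggered by that
// probe; otherwise the pod-level value applies; and only when none of these
// is set does the minimum default remain. A hypothetical sketch
// (exampleGracePeriodPrecedence is not part of the kubelet) of the
// liveness-probe case:
func exampleGracePeriodPrecedence(id kubecontainer.ContainerID) int64 {
	podGrace := int64(30)
	probeGrace := int64(5)
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			TerminationGracePeriodSeconds: &podGrace,
			Containers: []v1.Container{{
				Name: "app",
				LivenessProbe: &v1.Probe{
					TerminationGracePeriodSeconds: &probeGrace,
				},
			}},
		},
	}
	// A kill caused by the liveness probe uses the probe-level value: 5, not 30.
	return setTerminationGracePeriod(pod, &pod.Spec.Containers[0], "app", id, reasonLivenessProbe)
}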