k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kuberuntime/kuberuntime_container.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kuberuntime

import (
	"context"
	"errors"
	"fmt"
	"io"
	"math/rand"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	goruntime "runtime"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	crierror "k8s.io/cri-api/pkg/errors"

	"github.com/opencontainers/selinux/go-selinux"
	grpcstatus "google.golang.org/grpc/status"

	"github.com/armon/circbuf"
	"k8s.io/klog/v2"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubetypes "k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	remote "k8s.io/cri-client/pkg"
	kubelettypes "k8s.io/kubelet/pkg/types"
	"k8s.io/kubernetes/pkg/features"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/events"
	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
	"k8s.io/kubernetes/pkg/util/tail"
	volumeutil "k8s.io/kubernetes/pkg/volume/util"
)

var (
	// ErrCreateContainerConfig - failed to create container config
	ErrCreateContainerConfig = errors.New("CreateContainerConfigError")
	// ErrPreCreateHook - failed to execute PreCreateHook
	ErrPreCreateHook = errors.New("PreCreateHookError")
	// ErrCreateContainer - failed to create container
	ErrCreateContainer = errors.New("CreateContainerError")
	// ErrPreStartHook - failed to execute PreStartHook
	ErrPreStartHook = errors.New("PreStartHookError")
	// ErrPostStartHook - failed to execute PostStartHook
	ErrPostStartHook = errors.New("PostStartHookError")
)

// recordContainerEvent should be used by the runtime manager for all container-related events.
// It has sanity checks to ensure that we do not write events that can overwhelm our masters.
// In particular, it ensures that a containerID never appears in an event message, as that
// is prone to producing many distinct events that do not aggregate well.
// It replaces any reference to a containerID with the containerName, which is stable and is what users know.
func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
	ref, err := kubecontainer.GenerateContainerRef(pod, container)
	if err != nil {
		klog.ErrorS(err, "Can't make a container ref", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
		return
	}
	eventMessage := message
	if len(args) > 0 {
		eventMessage = fmt.Sprintf(message, args...)
	}
	// This is a hack, but the error from the runtime often includes the containerID,
	// which kills our ability to deduplicate events. This protection makes a huge
	// difference in the number of unique events.
	if containerID != "" {
		eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
	}
	m.recorder.Event(ref, eventType, reason, eventMessage)
}

// startSpec wraps the spec required to start a container, either a regular/init container
// or an ephemeral container. Ephemeral containers contain all the fields of regular/init
// containers, plus some additional fields. In both cases startSpec.container will be set.
type startSpec struct {
	container          *v1.Container
	ephemeralContainer *v1.EphemeralContainer
}

func containerStartSpec(c *v1.Container) *startSpec {
	return &startSpec{container: c}
}

func ephemeralContainerStartSpec(ec *v1.EphemeralContainer) *startSpec {
	return &startSpec{
		container:          (*v1.Container)(&ec.EphemeralContainerCommon),
		ephemeralContainer: ec,
	}
}

// getTargetID returns the kubecontainer.ContainerID for ephemeral container namespace
// targeting. The target is stored as EphemeralContainer.TargetContainerName, which must be
// resolved to a ContainerID using podStatus. The target container must already exist, which
// usually isn't a problem since ephemeral containers aren't allowed at pod creation time.
func (s *startSpec) getTargetID(podStatus *kubecontainer.PodStatus) (*kubecontainer.ContainerID, error) {
	if s.ephemeralContainer == nil || s.ephemeralContainer.TargetContainerName == "" {
		return nil, nil
	}

	targetStatus := podStatus.FindContainerStatusByName(s.ephemeralContainer.TargetContainerName)
	if targetStatus == nil {
		return nil, fmt.Errorf("unable to find target container %v", s.ephemeralContainer.TargetContainerName)
	}

	return &targetStatus.ID, nil
}

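// Illustrative sketch (purely an example, not used by the kubelet): how an ephemeral container's
// startSpec resolves its namespace target via getTargetID. The names and ID below ("debugger",
// "app", "abc123") are hypothetical example values.
func exampleEphemeralTargetResolution() {
	ec := &v1.EphemeralContainer{
		EphemeralContainerCommon: v1.EphemeralContainerCommon{Name: "debugger", Image: "busybox"},
		TargetContainerName:      "app",
	}
	spec := ephemeralContainerStartSpec(ec)

	// A pod status that already contains the target container.
	podStatus := &kubecontainer.PodStatus{
		ContainerStatuses: []*kubecontainer.Status{
			{Name: "app", ID: kubecontainer.ContainerID{Type: "containerd", ID: "abc123"}},
		},
	}

	// target resolves to the "app" container's ID; err is non-nil only if the target is missing.
	target, err := spec.getTargetID(podStatus)
	fmt.Println(target, err)
}
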
func calcRestartCountByLogDir(path string) (int, error) {
	// if the path doesn't exist then it's not an error
	if _, err := os.Stat(path); err != nil {
		return 0, nil
	}
	files, err := os.ReadDir(path)
	if err != nil {
		return 0, err
	}
	if len(files) == 0 {
		return 0, nil
	}
	restartCount := 0
	restartCountLogFileRegex := regexp.MustCompile(`^(\d+)\.log(\..*)?`)
	for _, file := range files {
		if file.IsDir() {
			continue
		}
		matches := restartCountLogFileRegex.FindStringSubmatch(file.Name())
		if len(matches) == 0 {
			continue
		}
		count, err := strconv.Atoi(matches[1])
		if err != nil {
			// it is unlikely the kubelet created this file;
			// more likely it is a custom file with random numbers in its name
			continue
		}
		count++
		if count > restartCount {
			restartCount = count
		}
	}
	return restartCount, nil
}

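// Illustrative sketch of the restart-count derivation above: the highest index parsed from
// {restartCount}.log-style file names, plus one, becomes the next restart count. The directory
// contents below are hypothetical example values.
func exampleRestartCountFromLogDir() {
	dir, err := os.MkdirTemp("", "kuberuntime-logdir-example")
	if err != nil {
		return
	}
	defer os.RemoveAll(dir)

	// 2.log.20240101-000000 mimics a rotated log file; it still matches the regex above.
	for _, name := range []string{"0.log", "1.log", "2.log.20240101-000000"} {
		_ = os.WriteFile(filepath.Join(dir, name), nil, 0o644)
	}

	count, err := calcRestartCountByLogDir(dir)
	fmt.Println(count, err) // 3 <nil>: the next container attempt would log to 3.log
}
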
// startContainer starts a container and returns a message indicating why it failed, on error.
// It starts the container through the following steps:
// * pull the image
// * create the container
// * start the container
// * run the post start lifecycle hooks (if applicable)
func (m *kubeGenericRuntimeManager) startContainer(ctx context.Context, podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
	container := spec.container

	// Step 1: pull the image.

	// If the RuntimeClassInImageCriAPI feature gate is enabled, pass the runtime handler
	// information for the runtime class specified. If no runtime class is
	// specified, then pass "".
	podRuntimeHandler := ""
	var err error
	if utilfeature.DefaultFeatureGate.Enabled(features.RuntimeClassInImageCriAPI) {
		if pod.Spec.RuntimeClassName != nil && *pod.Spec.RuntimeClassName != "" {
			podRuntimeHandler, err = m.runtimeClassManager.LookupRuntimeHandler(pod.Spec.RuntimeClassName)
			if err != nil {
				msg := fmt.Sprintf("Failed to lookup runtimeHandler for runtimeClassName %v", pod.Spec.RuntimeClassName)
				return msg, err
			}
		}
	}

	imageRef, msg, err := m.imagePuller.EnsureImageExists(ctx, pod, container, pullSecrets, podSandboxConfig, podRuntimeHandler)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return msg, err
	}

	// Step 2: create the container.
	// For a new container, the RestartCount should be 0.
	restartCount := 0
	containerStatus := podStatus.FindContainerStatusByName(container.Name)
	if containerStatus != nil {
		restartCount = containerStatus.RestartCount + 1
	} else {
		// The container runtime keeps state on container statuses and
		// what the container restart count is. When nodes are rebooted,
		// some container runtimes clear their state, which causes the
		// restartCount to be reset to 0. This causes the logfile to
		// start at 0.log, which either overwrites or appends to the
		// already existing log.
		//
		// We are checking to see if the log directory exists, and find
		// the latest restartCount by checking the log name -
		// {restartCount}.log - and adding 1 to it.
		logDir := BuildContainerLogsDirectory(m.podLogsDirectory, pod.Namespace, pod.Name, pod.UID, container.Name)
		restartCount, err = calcRestartCountByLogDir(logDir)
		if err != nil {
			klog.InfoS("Cannot calculate restartCount from the log directory", "logDir", logDir, "err", err)
			restartCount = 0
		}
	}

	target, err := spec.getTargetID(podStatus)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return s.Message(), ErrCreateContainerConfig
	}

	containerConfig, cleanupAction, err := m.generateContainerConfig(ctx, container, pod, restartCount, podIP, imageRef, podIPs, target)
	if cleanupAction != nil {
		defer cleanupAction()
	}
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return s.Message(), ErrCreateContainerConfig
	}

	err = m.internalLifecycle.PreCreateContainer(pod, container, containerConfig)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Internal PreCreateContainer hook failed: %v", s.Message())
		return s.Message(), ErrPreCreateHook
	}

	containerID, err := m.runtimeService.CreateContainer(ctx, podSandboxID, containerConfig, podSandboxConfig)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
		return s.Message(), ErrCreateContainer
	}
	err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", s.Message())
		return s.Message(), ErrPreStartHook
	}
	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, fmt.Sprintf("Created container %s", container.Name))

	// Step 3: start the container.
	err = m.runtimeService.StartContainer(ctx, containerID)
	if err != nil {
		s, _ := grpcstatus.FromError(err)
		m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
		return s.Message(), kubecontainer.ErrRunContainer
	}
	m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, fmt.Sprintf("Started container %s", container.Name))

	// Symlink container logs to the legacy container log location for cluster logging
	// support.
	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
	containerMeta := containerConfig.GetMetadata()
	sandboxMeta := podSandboxConfig.GetMetadata()
	legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name,
		sandboxMeta.Namespace)
	containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
	// only create legacy symlink if containerLog path exists (or the error is not IsNotExist).
	// Because if containerLog path does not exist, only a dangling legacySymlink would be created.
	// This dangling legacySymlink is later removed by container GC, so it does not make sense
	// to create it in the first place. This happens when the journald logging driver is used with docker.
	if _, err := m.osInterface.Stat(containerLog); !os.IsNotExist(err) {
		if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
			klog.ErrorS(err, "Failed to create legacy symbolic link", "path", legacySymlink,
				"containerID", containerID, "containerLogPath", containerLog)
		}
	}

	// Step 4: execute the post start hook.
	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
		kubeContainerID := kubecontainer.ContainerID{
			Type: m.runtimeName,
			ID:   containerID,
		}
		msg, handlerErr := m.runner.Run(ctx, kubeContainerID, pod, container, container.Lifecycle.PostStart)
		if handlerErr != nil {
			klog.ErrorS(handlerErr, "Failed to execute PostStartHook", "pod", klog.KObj(pod),
				"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
			// do not record the message in the event so that secrets won't leak from the server.
			m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, "PostStartHook failed")
			if err := m.killContainer(ctx, pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil, nil); err != nil {
				klog.ErrorS(err, "Failed to kill container", "pod", klog.KObj(pod),
					"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
			}
			return msg, ErrPostStartHook
		}
	}

	return "", nil
}

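// Illustrative sketch: mapping the sentinel errors that startContainer returns back to the step
// that failed. The helper name below is hypothetical and not used elsewhere in the kubelet.
func describeStartContainerFailure(err error) string {
	switch {
	case errors.Is(err, ErrCreateContainerConfig):
		return "generating the container config failed"
	case errors.Is(err, ErrPreCreateHook):
		return "the internal PreCreateContainer hook failed"
	case errors.Is(err, ErrCreateContainer):
		return "the CRI CreateContainer call failed"
	case errors.Is(err, ErrPreStartHook):
		return "the internal PreStartContainer hook failed"
	case errors.Is(err, kubecontainer.ErrRunContainer):
		return "the CRI StartContainer call failed"
	case errors.Is(err, ErrPostStartHook):
		return "the PostStart lifecycle hook failed and the container was killed"
	default:
		return "the image pull or another step failed"
	}
}
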
// generateContainerConfig generates container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateContainerConfig(ctx context.Context, container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
	opts, cleanupAction, err := m.runtimeHelper.GenerateRunContainerOptions(ctx, pod, container, podIP, podIPs)
	if err != nil {
		return nil, nil, err
	}

	uid, username, err := m.getImageUser(ctx, container.Image)
	if err != nil {
		return nil, cleanupAction, err
	}

	// Verify RunAsNonRoot. Non-root verification only supports numeric user.
	if err := verifyRunAsNonRoot(pod, container, uid, username); err != nil {
		return nil, cleanupAction, err
	}

	command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)
	logDir := BuildContainerLogsDirectory(m.podLogsDirectory, pod.Namespace, pod.Name, pod.UID, container.Name)
	err = m.osInterface.MkdirAll(logDir, 0755)
	if err != nil {
		return nil, cleanupAction, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)
	}
	containerLogsPath := buildContainerLogsPath(container.Name, restartCount)
	restartCountUint32 := uint32(restartCount)
	config := &runtimeapi.ContainerConfig{
		Metadata: &runtimeapi.ContainerMetadata{
			Name:    container.Name,
			Attempt: restartCountUint32,
		},
		Image:       &runtimeapi.ImageSpec{Image: imageRef, UserSpecifiedImage: container.Image},
		Command:     command,
		Args:        args,
		WorkingDir:  container.WorkingDir,
		Labels:      newContainerLabels(container, pod),
		Annotations: newContainerAnnotations(container, pod, restartCount, opts),
		Devices:     makeDevices(opts),
		CDIDevices:  makeCDIDevices(opts),
		Mounts:      m.makeMounts(opts, container),
		LogPath:     containerLogsPath,
		Stdin:       container.Stdin,
		StdinOnce:   container.StdinOnce,
		Tty:         container.TTY,
	}

	// set platform specific configurations.
	if err := m.applyPlatformSpecificContainerConfig(config, container, pod, uid, username, nsTarget); err != nil {
		return nil, cleanupAction, err
	}

	// set environment variables
	envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
	for idx := range opts.Envs {
		e := opts.Envs[idx]
		envs[idx] = &runtimeapi.KeyValue{
			Key:   e.Name,
			Value: e.Value,
		}
	}
	config.Envs = envs

	return config, cleanupAction, nil
}

func (m *kubeGenericRuntimeManager) updateContainerResources(pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error {
	containerResources := m.generateContainerResources(pod, container)
	if containerResources == nil {
		return fmt.Errorf("container %q updateContainerResources failed: cannot generate resources config", containerID.String())
	}
	ctx := context.Background()
	err := m.runtimeService.UpdateContainerResources(ctx, containerID.ID, containerResources)
	if err != nil {
		klog.ErrorS(err, "UpdateContainerResources failed", "container", containerID.String())
	}
	return err
}

// makeDevices generates container devices for kubelet runtime v1.
func makeDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.Device {
	devices := make([]*runtimeapi.Device, len(opts.Devices))

	for idx := range opts.Devices {
		device := opts.Devices[idx]
		devices[idx] = &runtimeapi.Device{
			HostPath:      device.PathOnHost,
			ContainerPath: device.PathInContainer,
			Permissions:   device.Permissions,
		}
	}

	return devices
}

// makeCDIDevices generates container CDIDevices for kubelet runtime v1.
func makeCDIDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.CDIDevice {
	devices := make([]*runtimeapi.CDIDevice, len(opts.CDIDevices))

	for i, device := range opts.CDIDevices {
		devices[i] = &runtimeapi.CDIDevice{
			Name: device.Name,
		}
	}

	return devices
}

// makeMounts generates container volume mounts for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) makeMounts(opts *kubecontainer.RunContainerOptions, container *v1.Container) []*runtimeapi.Mount {
	volumeMounts := []*runtimeapi.Mount{}

	for idx := range opts.Mounts {
		v := opts.Mounts[idx]
		selinuxRelabel := v.SELinuxRelabel && selinux.GetEnabled()
		mount := &runtimeapi.Mount{
			HostPath:          v.HostPath,
			ContainerPath:     v.ContainerPath,
			Readonly:          v.ReadOnly,
			SelinuxRelabel:    selinuxRelabel,
			Propagation:       v.Propagation,
			RecursiveReadOnly: v.RecursiveReadOnly,
		}

		volumeMounts = append(volumeMounts, mount)
	}

	// The reason we create and mount the log file here (not in kubelet) is because
	// the file's location depends on the ID of the container, and we need to create and
	// mount the file before actually starting the container.
	if opts.PodContainerDir != "" && len(container.TerminationMessagePath) != 0 {
		// Because the PodContainerDir contains the pod UID and container name, which is unique enough,
		// here we just add a random id to make the path unique for different instances
		// of the same container.
		cid := makeUID()
		containerLogPath := filepath.Join(opts.PodContainerDir, cid)
		fs, err := m.osInterface.Create(containerLogPath)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("error on creating termination-log file %q: %v", containerLogPath, err))
		} else {
			fs.Close()

			// Chmod is needed because os.Create() ends up calling
			// open(2) to create the file, so the final mode used is "mode &
			// ~umask". But we want to make sure the specified mode is used
			// in the file no matter what the umask is.
			if err := m.osInterface.Chmod(containerLogPath, 0666); err != nil {
				utilruntime.HandleError(fmt.Errorf("unable to set termination-log file permissions %q: %v", containerLogPath, err))
			}

			// Volume mounts fail on Windows if the path is not of the form C:/
			containerLogPath = volumeutil.MakeAbsolutePath(goruntime.GOOS, containerLogPath)
			terminationMessagePath := volumeutil.MakeAbsolutePath(goruntime.GOOS, container.TerminationMessagePath)
			selinuxRelabel := selinux.GetEnabled()
			volumeMounts = append(volumeMounts, &runtimeapi.Mount{
				HostPath:       containerLogPath,
				ContainerPath:  terminationMessagePath,
				SelinuxRelabel: selinuxRelabel,
			})
		}
	}

	return volumeMounts
}

// getKubeletContainers lists containers managed by kubelet.
// The boolean parameter specifies whether to return all containers, including
// exited and dead containers (used for garbage collection).
func (m *kubeGenericRuntimeManager) getKubeletContainers(ctx context.Context, allContainers bool) ([]*runtimeapi.Container, error) {
	filter := &runtimeapi.ContainerFilter{}
	if !allContainers {
		filter.State = &runtimeapi.ContainerStateValue{
			State: runtimeapi.ContainerState_CONTAINER_RUNNING,
		}
	}

	containers, err := m.runtimeService.ListContainers(ctx, filter)
	if err != nil {
		klog.ErrorS(err, "ListContainers failed")
		return nil, err
	}

	return containers, nil
}

// makeUID returns a randomly generated string.
func makeUID() string {
	return fmt.Sprintf("%08x", rand.Uint32())
}

// getTerminationMessage looks on the filesystem for the provided termination message path,
// returning a limited amount of the bytes there, or true if the logs should be checked instead.
func getTerminationMessage(status *runtimeapi.ContainerStatus, terminationMessagePath string, fallbackToLogs bool) (string, bool) {
	if len(terminationMessagePath) == 0 {
		return "", fallbackToLogs
	}
	// Volume mounts fail on Windows if the path is not of the form C:/
	terminationMessagePath = volumeutil.MakeAbsolutePath(goruntime.GOOS, terminationMessagePath)
	for _, mount := range status.Mounts {
		if mount.ContainerPath != terminationMessagePath {
			continue
		}
		path := mount.HostPath
		data, _, err := tail.ReadAtMost(path, kubecontainer.MaxContainerTerminationMessageLength)
		if err != nil {
			if os.IsNotExist(err) {
				return "", fallbackToLogs
			}
			return fmt.Sprintf("Error on reading termination log %s: %v", path, err), false
		}
		return string(data), (fallbackToLogs && len(data) == 0)
	}
	return "", fallbackToLogs
}

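// Illustrative sketch of how the termination message is recovered from the mount table above,
// assuming a Linux node (where MakeAbsolutePath leaves the path untouched). The file content and
// paths are hypothetical example values.
func exampleReadTerminationMessage() {
	f, err := os.CreateTemp("", "termination-log-example")
	if err != nil {
		return
	}
	defer os.Remove(f.Name())
	_, _ = f.WriteString("migration finished, exiting")
	_ = f.Close()

	// The CRI status records where the in-container termination path is mounted on the host.
	status := &runtimeapi.ContainerStatus{
		Mounts: []*runtimeapi.Mount{
			{ContainerPath: "/dev/termination-log", HostPath: f.Name()},
		},
	}

	// msg is the (length-limited) file content; the second result stays false because data was found.
	msg, fallbackToLogs := getTerminationMessage(status, "/dev/termination-log", false)
	fmt.Println(msg, fallbackToLogs)
}
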
// readLastStringFromContainerLogs attempts to read up to the max log length from the end of the CRI log represented
// by path. It reads up to max log lines.
func (m *kubeGenericRuntimeManager) readLastStringFromContainerLogs(path string) string {
	value := int64(kubecontainer.MaxContainerTerminationMessageLogLines)
	buf, _ := circbuf.NewBuffer(kubecontainer.MaxContainerTerminationMessageLogLength)
	if err := m.ReadLogs(context.Background(), path, "", &v1.PodLogOptions{TailLines: &value}, buf, buf); err != nil {
		return fmt.Sprintf("Error on reading termination message from logs: %v", err)
	}
	return buf.String()
}

func (m *kubeGenericRuntimeManager) convertToKubeContainerStatus(status *runtimeapi.ContainerStatus) (cStatus *kubecontainer.Status) {
	cStatus = toKubeContainerStatus(status, m.runtimeName)
	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
		// Populate the termination message if needed.
		annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
		// If a container cannot even be started, it certainly does not have logs, so no need to fallbackToLogs.
		fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError &&
			cStatus.ExitCode != 0 && cStatus.Reason != "ContainerCannotRun"
		tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
		if checkLogs {
			tMessage = m.readLastStringFromContainerLogs(status.GetLogPath())
		}
		// Enrich the termination message if the one written by the application is not empty.
		if len(tMessage) != 0 {
			if len(cStatus.Message) != 0 {
				cStatus.Message += ": "
			}
			cStatus.Message += tMessage
		}
	}
	return cStatus
}

// getPodContainerStatuses gets all containers' statuses for the pod.
func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context, uid kubetypes.UID, name, namespace string) ([]*kubecontainer.Status, error) {
	// Select all containers of the given pod.
	containers, err := m.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{
		LabelSelector: map[string]string{kubelettypes.KubernetesPodUIDLabel: string(uid)},
	})
	if err != nil {
		klog.ErrorS(err, "ListContainers error")
		return nil, err
	}

	statuses := []*kubecontainer.Status{}
	// TODO: optimization: set maximum number of containers per container name to examine.
	for _, c := range containers {
		resp, err := m.runtimeService.ContainerStatus(ctx, c.Id, false)
		// Between List (ListContainers) and check (ContainerStatus) another thread might remove a container, and that is normal.
		// The previous call (ListContainers) never fails due to a pod container not existing.
		// Therefore, this method should not either, but instead act as if the previous call failed,
		// which means the error should be ignored.
		if crierror.IsNotFound(err) {
			continue
		}
		if err != nil {
			// Merely log this here; GetPodStatus will actually report the error out.
			klog.V(4).InfoS("ContainerStatus return error", "containerID", c.Id, "err", err)
			return nil, err
		}
		status := resp.GetStatus()
		if status == nil {
			return nil, remote.ErrContainerStatusNil
		}
		cStatus := m.convertToKubeContainerStatus(status)
		statuses = append(statuses, cStatus)
	}

	sort.Sort(containerStatusByCreated(statuses))
	return statuses, nil
}

func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName string) *kubecontainer.Status {
	annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
	labeledInfo := getContainerInfoFromLabels(status.Labels)
	var cStatusResources *kubecontainer.ContainerResources
	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
		// If the runtime reports cpu & memory resources info, add it to the container status.
		cStatusResources = toKubeContainerResources(status.Resources)
	}

	// Keep backwards compatibility with older runtimes; status.ImageId was added in v1.30.
	imageID := status.ImageRef
	if status.ImageId != "" {
		imageID = status.ImageId
	}

	cStatus := &kubecontainer.Status{
		ID: kubecontainer.ContainerID{
			Type: runtimeName,
			ID:   status.Id,
		},
		Name:                 labeledInfo.ContainerName,
		Image:                status.Image.Image,
		ImageID:              imageID,
		ImageRef:             status.ImageRef,
		ImageRuntimeHandler:  status.Image.RuntimeHandler,
		Hash:                 annotatedInfo.Hash,
		HashWithoutResources: annotatedInfo.HashWithoutResources,
		RestartCount:         annotatedInfo.RestartCount,
		State:                toKubeContainerState(status.State),
		CreatedAt:            time.Unix(0, status.CreatedAt),
		Resources:            cStatusResources,
	}

	if status.State != runtimeapi.ContainerState_CONTAINER_CREATED {
		// If the container is not in the created state, we have tried and
		// started the container. Set the StartedAt time.
		cStatus.StartedAt = time.Unix(0, status.StartedAt)
	}
	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
		cStatus.Reason = status.Reason
		cStatus.Message = status.Message
		cStatus.ExitCode = int(status.ExitCode)
		cStatus.FinishedAt = time.Unix(0, status.FinishedAt)
	}
	return cStatus
}

// executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
func (m *kubeGenericRuntimeManager) executePreStopHook(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
	klog.V(3).InfoS("Running preStop hook", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerSpec.Name, "containerID", containerID.String())

	start := metav1.Now()
	done := make(chan struct{})
	go func() {
		defer close(done)
		defer utilruntime.HandleCrash()
		if _, err := m.runner.Run(ctx, containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
			klog.ErrorS(err, "PreStop hook failed", "pod", klog.KObj(pod), "podUID", pod.UID,
				"containerName", containerSpec.Name, "containerID", containerID.String())
			// do not record the message in the event so that secrets won't leak from the server.
			m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, "PreStopHook failed")
		}
	}()

	select {
	case <-time.After(time.Duration(gracePeriod) * time.Second):
		klog.V(2).InfoS("PreStop hook not completed in grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
			"containerName", containerSpec.Name, "containerID", containerID.String(), "gracePeriod", gracePeriod)
	case <-done:
		klog.V(3).InfoS("PreStop hook completed", "pod", klog.KObj(pod), "podUID", pod.UID,
			"containerName", containerSpec.Name, "containerID", containerID.String())
	}

	return int64(metav1.Now().Sub(start.Time).Seconds())
}

// restoreSpecsFromContainerLabels restores all information needed for killing a container. In some
// cases we may not have the pod and container spec when killing a container, e.g. the pod is deleted
// during a kubelet restart.
// To solve this problem, we've already written the necessary information into container labels. Here we
// just need to retrieve it from the container labels and restore the specs.
// TODO(random-liu): Add a node e2e test to test this behaviour.
// TODO(random-liu): Change the lifecycle handler to just accept information needed, so that we can
// just pass the needed function not create the fake object.
func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(ctx context.Context, containerID kubecontainer.ContainerID) (*v1.Pod, *v1.Container, error) {
	var pod *v1.Pod
	var container *v1.Container
	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
	if err != nil {
		return nil, nil, err
	}
	s := resp.GetStatus()
	if s == nil {
		return nil, nil, remote.ErrContainerStatusNil
	}

	l := getContainerInfoFromLabels(s.Labels)
	a := getContainerInfoFromAnnotations(s.Annotations)
	// Note that the following is not a full spec. The container killing code should not use
	// un-restored fields.
	pod = &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			UID:                        l.PodUID,
			Name:                       l.PodName,
			Namespace:                  l.PodNamespace,
			DeletionGracePeriodSeconds: a.PodDeletionGracePeriod,
		},
		Spec: v1.PodSpec{
			TerminationGracePeriodSeconds: a.PodTerminationGracePeriod,
		},
	}
	container = &v1.Container{
		Name:                   l.ContainerName,
		Ports:                  a.ContainerPorts,
		TerminationMessagePath: a.TerminationMessagePath,
	}
	if a.PreStopHandler != nil {
		container.Lifecycle = &v1.Lifecycle{
			PreStop: a.PreStopHandler,
		}
	}
	return pod, container, nil
}

// killContainer kills a container through the following steps:
// * Run the pre-stop lifecycle hooks (if applicable).
// * Stop the container.
func (m *kubeGenericRuntimeManager) killContainer(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64, ordering *terminationOrdering) error {
	var containerSpec *v1.Container
	if pod != nil {
		if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
			return fmt.Errorf("failed to get containerSpec %q (id=%q) in pod %q when killing container for reason %q",
				containerName, containerID.String(), format.Pod(pod), message)
		}
	} else {
		// Restore necessary information if one of the specs is nil.
		restoredPod, restoredContainer, err := m.restoreSpecsFromContainerLabels(ctx, containerID)
		if err != nil {
			return err
		}
		pod, containerSpec = restoredPod, restoredContainer
	}

	// From this point, pod and container must be non-nil.
	gracePeriod := setTerminationGracePeriod(pod, containerSpec, containerName, containerID, reason)

	if len(message) == 0 {
		message = fmt.Sprintf("Stopping container %s", containerSpec.Name)
	}
	m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)

	if gracePeriodOverride != nil {
		gracePeriod = *gracePeriodOverride
		klog.V(3).InfoS("Killing container with a grace period override", "pod", klog.KObj(pod), "podUID", pod.UID,
			"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)
	}

	// Run the pre-stop lifecycle hooks if applicable and if there is enough time to run it
	if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
		gracePeriod = gracePeriod - m.executePreStopHook(ctx, pod, containerID, containerSpec, gracePeriod)
	}

	// if we care about termination ordering, then wait for this container's turn to exit if there is
	// time remaining
	if ordering != nil && gracePeriod > 0 {
		// grace period is only in seconds, so the time we've waited gets truncated downward
		gracePeriod -= int64(ordering.waitForTurn(containerName, gracePeriod))
	}

	// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
	if gracePeriod < minimumGracePeriodInSeconds {
		gracePeriod = minimumGracePeriodInSeconds
	}

	klog.V(2).InfoS("Killing container with a grace period", "pod", klog.KObj(pod), "podUID", pod.UID,
		"containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod)

	err := m.runtimeService.StopContainer(ctx, containerID.ID, gracePeriod)
	if err != nil && !crierror.IsNotFound(err) {
		klog.ErrorS(err, "Container termination failed with gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID,
gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID, 785 "containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod) 786 return err 787 } 788 klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID, 789 "containerName", containerName, "containerID", containerID.String()) 790 791 if ordering != nil { 792 ordering.containerTerminated(containerName) 793 } 794 795 return nil 796 } 797 798 // killContainersWithSyncResult kills all pod's containers with sync results. 799 func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) { 800 containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers)) 801 wg := sync.WaitGroup{} 802 803 wg.Add(len(runningPod.Containers)) 804 var termOrdering *terminationOrdering 805 // we only care about container termination ordering if the sidecars feature is enabled 806 if utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) { 807 var runningContainerNames []string 808 for _, container := range runningPod.Containers { 809 runningContainerNames = append(runningContainerNames, container.Name) 810 } 811 termOrdering = newTerminationOrdering(pod, runningContainerNames) 812 } 813 for _, container := range runningPod.Containers { 814 go func(container *kubecontainer.Container) { 815 defer utilruntime.HandleCrash() 816 defer wg.Done() 817 818 killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name) 819 if err := m.killContainer(ctx, pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride, termOrdering); err != nil { 820 killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error()) 821 // Use runningPod for logging as the pod passed in could be *nil*. 822 klog.ErrorS(err, "Kill container failed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID, 823 "containerName", container.Name, "containerID", container.ID) 824 } 825 containerResults <- killContainerResult 826 }(container) 827 } 828 wg.Wait() 829 close(containerResults) 830 831 for containerResult := range containerResults { 832 syncResults = append(syncResults, containerResult) 833 } 834 return 835 } 836 837 // pruneInitContainersBeforeStart ensures that before we begin creating init 838 // containers, we have reduced the number of outstanding init containers still 839 // present. This reduces load on the container garbage collector by only 840 // preserving the most recent terminated init container. 841 func (m *kubeGenericRuntimeManager) pruneInitContainersBeforeStart(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) { 842 // only the last execution of each init container should be preserved, and only preserve it if it is in the 843 // list of init containers to keep. 844 initContainerNames := sets.NewString() 845 for _, container := range pod.Spec.InitContainers { 846 initContainerNames.Insert(container.Name) 847 } 848 for name := range initContainerNames { 849 count := 0 850 for _, status := range podStatus.ContainerStatuses { 851 if status.Name != name || 852 (status.State != kubecontainer.ContainerStateExited && 853 status.State != kubecontainer.ContainerStateUnknown) { 854 continue 855 } 856 // Remove init containers in unknown state. It should have 857 // been stopped before pruneInitContainersBeforeStart is 858 // called. 
// killContainersWithSyncResult kills all of the pod's containers with sync results.
func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(ctx context.Context, pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) {
	containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
	wg := sync.WaitGroup{}

	wg.Add(len(runningPod.Containers))
	var termOrdering *terminationOrdering
	// we only care about container termination ordering if the sidecars feature is enabled
	if utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) {
		var runningContainerNames []string
		for _, container := range runningPod.Containers {
			runningContainerNames = append(runningContainerNames, container.Name)
		}
		termOrdering = newTerminationOrdering(pod, runningContainerNames)
	}
	for _, container := range runningPod.Containers {
		go func(container *kubecontainer.Container) {
			defer utilruntime.HandleCrash()
			defer wg.Done()

			killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
			if err := m.killContainer(ctx, pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride, termOrdering); err != nil {
				killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
				// Use runningPod for logging as the pod passed in could be *nil*.
				klog.ErrorS(err, "Kill container failed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID,
					"containerName", container.Name, "containerID", container.ID)
			}
			containerResults <- killContainerResult
		}(container)
	}
	wg.Wait()
	close(containerResults)

	for containerResult := range containerResults {
		syncResults = append(syncResults, containerResult)
	}
	return
}

// pruneInitContainersBeforeStart ensures that before we begin creating init
// containers, we have reduced the number of outstanding init containers still
// present. This reduces load on the container garbage collector by only
// preserving the most recent terminated init container.
func (m *kubeGenericRuntimeManager) pruneInitContainersBeforeStart(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
	// only the last execution of each init container should be preserved, and only preserve it if it is in the
	// list of init containers to keep.
	initContainerNames := sets.NewString()
	for _, container := range pod.Spec.InitContainers {
		initContainerNames.Insert(container.Name)
	}
	for name := range initContainerNames {
		count := 0
		for _, status := range podStatus.ContainerStatuses {
			if status.Name != name ||
				(status.State != kubecontainer.ContainerStateExited &&
					status.State != kubecontainer.ContainerStateUnknown) {
				continue
			}
			// Remove init containers in unknown state. They should have
			// been stopped before pruneInitContainersBeforeStart is
			// called.
			count++
			// keep the first init container for this name
			if count == 1 {
				continue
			}
			// prune all other init containers that match this container name
			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
				continue
			}
		}
	}
}

// Remove all init containers. Note that this function does not check the state
// of the container because it assumes all init containers have been stopped
// before the call happens.
func (m *kubeGenericRuntimeManager) purgeInitContainers(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
	initContainerNames := sets.NewString()
	for _, container := range pod.Spec.InitContainers {
		initContainerNames.Insert(container.Name)
	}
	for name := range initContainerNames {
		count := 0
		for _, status := range podStatus.ContainerStatuses {
			if status.Name != name {
				continue
			}
			count++
			// Purge all init containers that match this container name
			klog.V(4).InfoS("Removing init container", "containerName", status.Name, "containerID", status.ID.ID, "count", count)
			if err := m.removeContainer(ctx, status.ID.ID); err != nil {
				utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
				continue
			}
		}
	}
}

// findNextInitContainerToRun returns the status of the last failed container, the
// next init container to start, or done if there are no further init containers.
// Status is only returned if an init container failed, in which case next will
// point to the current container.
func findNextInitContainerToRun(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (status *kubecontainer.Status, next *v1.Container, done bool) {
	if len(pod.Spec.InitContainers) == 0 {
		return nil, nil, true
	}

	// If any of the main containers have status and are Running, then all init containers must
	// have been executed at some point in the past. However, they could have been removed
	// from the container runtime now, and if we proceed, it would appear as if they
	// never ran and will re-execute improperly.
	for i := range pod.Spec.Containers {
		container := &pod.Spec.Containers[i]
		status := podStatus.FindContainerStatusByName(container.Name)
		if status != nil && status.State == kubecontainer.ContainerStateRunning {
			return nil, nil, true
		}
	}

	// If there are failed containers, return the status of the last failed one.
	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
		container := &pod.Spec.InitContainers[i]
		status := podStatus.FindContainerStatusByName(container.Name)
		if status != nil && isInitContainerFailed(status) {
			return status, container, false
		}
	}

	// There are no failed containers now.
	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
		container := &pod.Spec.InitContainers[i]
		status := podStatus.FindContainerStatusByName(container.Name)
		if status == nil {
			continue
		}

		// container is still running, return not done.
		if status.State == kubecontainer.ContainerStateRunning {
			return nil, nil, false
		}

		if status.State == kubecontainer.ContainerStateExited {
			// all init containers successful
			if i == (len(pod.Spec.InitContainers) - 1) {
				return nil, nil, true
			}

			// all containers up to i successful, go to i+1
			return nil, &pod.Spec.InitContainers[i+1], false
		}
	}

	return nil, &pod.Spec.InitContainers[0], false
}

// hasAnyRegularContainerCreated returns true if any regular container has been
// created, which indicates all init containers have been initialized.
func hasAnyRegularContainerCreated(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool {
	for _, container := range pod.Spec.Containers {
		status := podStatus.FindContainerStatusByName(container.Name)
		if status == nil {
			continue
		}
		switch status.State {
		case kubecontainer.ContainerStateCreated,
			kubecontainer.ContainerStateRunning,
			kubecontainer.ContainerStateExited:
			return true
		default:
			// Ignore other states
		}
	}
	return false
}

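// Illustrative sketch of findNextInitContainerToRun: with two init containers where only the
// first has completed successfully, the second one is returned as the next container to start.
// The pod shape and names below are hypothetical example values.
func exampleFindNextInitContainer() {
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			InitContainers: []v1.Container{{Name: "init-a"}, {Name: "init-b"}},
			Containers:     []v1.Container{{Name: "app"}},
		},
	}
	podStatus := &kubecontainer.PodStatus{
		ContainerStatuses: []*kubecontainer.Status{
			{Name: "init-a", State: kubecontainer.ContainerStateExited, ExitCode: 0},
		},
	}

	// status is nil (nothing failed), next points at "init-b", and done is false.
	status, next, done := findNextInitContainerToRun(pod, podStatus)
	fmt.Println(status, next.Name, done)
}
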
// computeInitContainerActions sets the actions on the given changes that need
// to be taken for the init containers. This includes actions to initialize the
// init containers and actions to keep restartable init containers running.
// computeInitContainerActions returns true if the pod has been initialized.
//
// The actions include:
// - Start the first init container that has not been started.
// - Restart all restartable init containers that have started but are not running.
// - Kill the restartable init containers that are not alive or started.
//
// Note that this is a function for the SidecarContainers feature.
// Please sync with the findNextInitContainerToRun function if any changes are
// made, as either this or that function will be called.
func (m *kubeGenericRuntimeManager) computeInitContainerActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus, changes *podActions) bool {
	if len(pod.Spec.InitContainers) == 0 {
		return true
	}

	// If any of the main containers have status and are Running, then all init containers must
	// have been executed at some point in the past. However, they could have been removed
	// from the container runtime now, and if we proceed, it would appear as if they
	// never ran and will re-execute improperly, except for the restartable init containers.
	podHasInitialized := false
	for _, container := range pod.Spec.Containers {
		status := podStatus.FindContainerStatusByName(container.Name)
		if status == nil {
			continue
		}
		switch status.State {
		case kubecontainer.ContainerStateCreated,
			kubecontainer.ContainerStateRunning:
			podHasInitialized = true
		case kubecontainer.ContainerStateExited:
			// This is a workaround for the issue that the kubelet cannot
			// differentiate the container statuses of the previous podSandbox
			// from the current one.
			// If the node is rebooted, all containers will be in the exited
			// state and the kubelet will try to recreate a new podSandbox.
			// In this case, the kubelet should not mistakenly think that
			// the newly created podSandbox has been initialized.
		default:
			// Ignore other states
		}
		if podHasInitialized {
			break
		}
	}

	// isPreviouslyInitialized indicates if the current init container is
	// previously initialized.
	isPreviouslyInitialized := podHasInitialized
	restartOnFailure := shouldRestartOnFailure(pod)

	// Note that we iterate through the init containers in reverse order to find
	// the next init container to run, as the completed init containers may get
	// removed from the container runtime for various reasons. Therefore the kubelet
	// should rely on the minimal number of init containers - the last one.
	//
	// Once we find the next init container to run, iterate through the rest to
	// find the restartable init containers to restart.
	for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
		container := &pod.Spec.InitContainers[i]
		status := podStatus.FindContainerStatusByName(container.Name)
		klog.V(4).InfoS("Computing init container action", "pod", klog.KObj(pod), "container", container.Name, "status", status)
		if status == nil {
			// If the container is previously initialized but its status is not
			// found, it means its last status was removed for some reason.
			// Restart it if it is a restartable init container.
			if isPreviouslyInitialized && types.IsRestartableInitContainer(container) {
				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
			}
			continue
		}

		if isPreviouslyInitialized && !types.IsRestartableInitContainer(container) {
			// after initialization, only restartable init containers need to be kept
			// running
			continue
		}

		switch status.State {
		case kubecontainer.ContainerStateCreated:
			// nothing to do but wait for it to start

		case kubecontainer.ContainerStateRunning:
			if !types.IsRestartableInitContainer(container) {
				break
			}

			if types.IsRestartableInitContainer(container) {
				if container.StartupProbe != nil {
					startup, found := m.startupManager.Get(status.ID)
					if !found {
						// If the startup probe has not been run, wait for it.
						break
					}
					if startup != proberesults.Success {
						if startup == proberesults.Failure {
							// If the restartable init container failed the startup probe,
							// restart it.
							changes.ContainersToKill[status.ID] = containerToKillInfo{
								name:      container.Name,
								container: container,
								message:   fmt.Sprintf("Init container %s failed startup probe", container.Name),
								reason:    reasonStartupProbe,
							}
							changes.InitContainersToStart = append(changes.InitContainersToStart, i)
						}
						break
					}
				}

				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
				if i == (len(pod.Spec.InitContainers) - 1) {
					podHasInitialized = true
				} else if !isPreviouslyInitialized {
					// this init container is initialized for the first time, start the next one
					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
				}

				// A restartable init container does not have to take into account its
				// liveness probe when it determines to start the next init container.
				if container.LivenessProbe != nil {
					liveness, found := m.livenessManager.Get(status.ID)
					if !found {
						// If the liveness probe has not been run, wait for it.
						break
					}
					if liveness == proberesults.Failure {
						// If the restartable init container failed the liveness probe,
						// restart it.
						changes.ContainersToKill[status.ID] = containerToKillInfo{
							name:      container.Name,
							container: container,
							message:   fmt.Sprintf("Init container %s failed liveness probe", container.Name),
							reason:    reasonLivenessProbe,
						}
						changes.InitContainersToStart = append(changes.InitContainersToStart, i)
					}
				}
			} else { // init container
				// nothing to do but wait for it to finish
				break
			}

		// If the init container failed and the restart policy is Never, the pod is terminal.
		// Otherwise, restart the init container.
		case kubecontainer.ContainerStateExited:
			if types.IsRestartableInitContainer(container) {
				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
			} else { // init container
				if isInitContainerFailed(status) {
					if !restartOnFailure {
						changes.KillPod = true
						changes.InitContainersToStart = nil
						return false
					}
					changes.InitContainersToStart = append(changes.InitContainersToStart, i)
					break
				}

				klog.V(4).InfoS("Init container has been initialized", "pod", klog.KObj(pod), "container", container.Name)
				if i == (len(pod.Spec.InitContainers) - 1) {
					podHasInitialized = true
				} else {
					// this init container is initialized for the first time, start the next one
					changes.InitContainersToStart = append(changes.InitContainersToStart, i+1)
				}
			}

		default: // kubecontainer.ContainerStatusUnknown or other unknown states
			if types.IsRestartableInitContainer(container) {
				// If the restartable init container is in unknown state, restart it.
				changes.ContainersToKill[status.ID] = containerToKillInfo{
					name:      container.Name,
					container: container,
					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
						status.State),
					reason: reasonUnknown,
				}
				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
			} else { // init container
				if !isInitContainerFailed(status) {
					klog.V(4).InfoS("This should not happen, init container is in unknown state but not failed", "pod", klog.KObj(pod), "containerStatus", status)
				}

				if !restartOnFailure {
					changes.KillPod = true
					changes.InitContainersToStart = nil
					return false
				}

				// If the init container is in unknown state, restart it.
				changes.ContainersToKill[status.ID] = containerToKillInfo{
					name:      container.Name,
					container: container,
					message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
						status.State),
					reason: reasonUnknown,
				}
				changes.InitContainersToStart = append(changes.InitContainersToStart, i)
			}
		}

		if !isPreviouslyInitialized {
			// the one before this init container has been initialized
			isPreviouslyInitialized = true
		}
	}

	// this means no init containers have been started,
	// start the first one
	if !isPreviouslyInitialized {
		changes.InitContainersToStart = append(changes.InitContainersToStart, 0)
	}

	// reverse the InitContainersToStart, as the above loop iterated through the
	// init containers backwards, but we want to start them as per the order in
	// the pod spec.
	l := len(changes.InitContainersToStart)
	for i := 0; i < l/2; i++ {
		changes.InitContainersToStart[i], changes.InitContainersToStart[l-1-i] =
			changes.InitContainersToStart[l-1-i], changes.InitContainersToStart[i]
	}

	return podHasInitialized
}

// GetContainerLogs returns logs of a specific container.
func (m *kubeGenericRuntimeManager) GetContainerLogs(ctx context.Context, pod *v1.Pod, containerID kubecontainer.ContainerID, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) (err error) {
	resp, err := m.runtimeService.ContainerStatus(ctx, containerID.ID, false)
	if err != nil {
		klog.V(4).InfoS("Failed to get container status", "containerID", containerID.String(), "err", err)
		return fmt.Errorf("unable to retrieve container logs for %v", containerID.String())
	}
	status := resp.GetStatus()
	if status == nil {
		return remote.ErrContainerStatusNil
	}
	return m.ReadLogs(ctx, status.GetLogPath(), containerID.ID, logOptions, stdout, stderr)
}

// GetExec gets the endpoint the runtime will serve the exec request from.
func (m *kubeGenericRuntimeManager) GetExec(ctx context.Context, id kubecontainer.ContainerID, cmd []string, stdin, stdout, stderr, tty bool) (*url.URL, error) {
	req := &runtimeapi.ExecRequest{
		ContainerId: id.ID,
		Cmd:         cmd,
		Tty:         tty,
		Stdin:       stdin,
		Stdout:      stdout,
		Stderr:      stderr,
	}
	resp, err := m.runtimeService.Exec(ctx, req)
	if err != nil {
		return nil, err
	}

	return url.Parse(resp.Url)
}

// GetAttach gets the endpoint the runtime will serve the attach request from.
func (m *kubeGenericRuntimeManager) GetAttach(ctx context.Context, id kubecontainer.ContainerID, stdin, stdout, stderr, tty bool) (*url.URL, error) {
	req := &runtimeapi.AttachRequest{
		ContainerId: id.ID,
		Stdin:       stdin,
		Stdout:      stdout,
		Stderr:      stderr,
		Tty:         tty,
	}
	resp, err := m.runtimeService.Attach(ctx, req)
	if err != nil {
		return nil, err
	}
	return url.Parse(resp.Url)
}

// RunInContainer synchronously executes the command in the container, and returns the output.
func (m *kubeGenericRuntimeManager) RunInContainer(ctx context.Context, id kubecontainer.ContainerID, cmd []string, timeout time.Duration) ([]byte, error) {
	stdout, stderr, err := m.runtimeService.ExecSync(ctx, id.ID, cmd, timeout)
	// NOTE(tallclair): This does not correctly interleave stdout & stderr, but should be sufficient
	// for logging purposes. A combined output option will need to be added to the ExecSyncRequest
	// if more precise output ordering is ever required.
	return append(stdout, stderr...), err
}

// removeContainer removes the container and the container logs.
// Notice that we remove the container logs first, so that the container will not be removed if
// the container logs fail to be removed, and the kubelet will retry this later. This guarantees
// that container logs are removed with the container.
// Notice that we assume that the container should only be removed in a non-running state, and
// that it will not write container logs anymore in that state.
func (m *kubeGenericRuntimeManager) removeContainer(ctx context.Context, containerID string) error {
	klog.V(4).InfoS("Removing container", "containerID", containerID)
	// Call internal container post-stop lifecycle hook.
	if err := m.internalLifecycle.PostStopContainer(containerID); err != nil {
		return err
	}

	// Remove the container log.
	// TODO: Separate log and container lifecycle management.
	if err := m.removeContainerLog(ctx, containerID); err != nil {
		return err
	}
	// Remove the container.
	return m.runtimeService.RemoveContainer(ctx, containerID)
}

// removeContainerLog removes the container log.
func (m *kubeGenericRuntimeManager) removeContainerLog(ctx context.Context, containerID string) error {
	// Use log manager to remove rotated logs.
	err := m.logManager.Clean(ctx, containerID)
	if err != nil {
		return err
	}

	resp, err := m.runtimeService.ContainerStatus(ctx, containerID, false)
	if err != nil {
		return fmt.Errorf("failed to get container status %q: %v", containerID, err)
	}
	status := resp.GetStatus()
	if status == nil {
		return remote.ErrContainerStatusNil
	}
	// Remove the legacy container log symlink.
	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
	labeledInfo := getContainerInfoFromLabels(status.Labels)
	legacySymlink := legacyLogSymlink(containerID, labeledInfo.ContainerName, labeledInfo.PodName,
		labeledInfo.PodNamespace)
	if err := m.osInterface.Remove(legacySymlink); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to remove container %q log legacy symbolic link %q: %v",
			containerID, legacySymlink, err)
	}
	return nil
}

// DeleteContainer removes a container.
func (m *kubeGenericRuntimeManager) DeleteContainer(ctx context.Context, containerID kubecontainer.ContainerID) error {
	return m.removeContainer(ctx, containerID.ID)
}

// setTerminationGracePeriod determines the grace period to use when killing a container.
func setTerminationGracePeriod(pod *v1.Pod, containerSpec *v1.Container, containerName string, containerID kubecontainer.ContainerID, reason containerKillReason) int64 {
	gracePeriod := int64(minimumGracePeriodInSeconds)
	switch {
	case pod.DeletionGracePeriodSeconds != nil:
		return *pod.DeletionGracePeriodSeconds
	case pod.Spec.TerminationGracePeriodSeconds != nil:
		switch reason {
		case reasonStartupProbe:
			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.StartupProbe, containerName, containerID, "StartupProbe") {
				return *containerSpec.StartupProbe.TerminationGracePeriodSeconds
			}
		case reasonLivenessProbe:
			if isProbeTerminationGracePeriodSecondsSet(pod, containerSpec, containerSpec.LivenessProbe, containerName, containerID, "LivenessProbe") {
				return *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
			}
		}
		return *pod.Spec.TerminationGracePeriodSeconds
	}
	return gracePeriod
}

func isProbeTerminationGracePeriodSecondsSet(pod *v1.Pod, containerSpec *v1.Container, probe *v1.Probe, containerName string, containerID kubecontainer.ContainerID, probeType string) bool {
	if probe != nil && probe.TerminationGracePeriodSeconds != nil {
		if *probe.TerminationGracePeriodSeconds > *pod.Spec.TerminationGracePeriodSeconds {
			klog.V(4).InfoS("Using probe-level grace period that is greater than the pod-level grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerName, "containerID", containerID.String(), "probeType", probeType, "probeGracePeriod", *probe.TerminationGracePeriodSeconds, "podGracePeriod", *pod.Spec.TerminationGracePeriodSeconds)
		}
		return true
	}
	return false
}

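// Illustrative sketch of how setTerminationGracePeriod resolves the grace period when a
// liveness-probe failure triggers the kill: a probe-level terminationGracePeriodSeconds takes
// precedence over the pod-level value. The numbers and names below are hypothetical example values.
func exampleTerminationGracePeriodForLivenessKill() {
	podGrace := int64(30)
	probeGrace := int64(10)
	container := v1.Container{
		Name: "app",
		LivenessProbe: &v1.Probe{
			TerminationGracePeriodSeconds: &probeGrace,
		},
	}
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			TerminationGracePeriodSeconds: &podGrace,
			Containers:                    []v1.Container{container},
		},
	}

	// Returns 10 (the probe-level override), not the pod-level 30.
	gracePeriod := setTerminationGracePeriod(pod, &container, container.Name, kubecontainer.ContainerID{}, reasonLivenessProbe)
	fmt.Println(gracePeriod)
}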