github.com/GoogleContainerTools/skaffold/v2@v2.13.2/pkg/diag/validator/validator.go (about)

     1  /*
     2  Copyright 2019 The Skaffold Authors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package validator
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os/exec"
    23  	"regexp"
    24  	"strings"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/apimachinery/pkg/runtime"
    29  	"k8s.io/client-go/kubernetes"
    30  	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    31  	deploymentutil "k8s.io/kubectl/pkg/util/deployment"
    32  
    33  	"github.com/GoogleContainerTools/skaffold/v2/pkg/diag/recommender"
    34  	"github.com/GoogleContainerTools/skaffold/v2/pkg/skaffold/output/log"
    35  	proto "github.com/GoogleContainerTools/skaffold/v2/proto/v1"
    36  )
    37  
const (
	// Pod phases as reported in PodStatus.Phase.
	success = "Succeeded"
	running = "Running"
	// actionableMessage is the fallback user-facing hint when no specific
	// diagnostic message is available; formatted with (namespace, pod name).
	actionableMessage = `could not determine pod status. Try kubectl describe -n %s po/%s`
	// errorPrefix matches docker daemon error lines; the <Error> group captures
	// the message after "Error response from daemon:".
	errorPrefix = `(?P<Prefix>)(?P<DaemonLog>Error response from daemon\:)(?P<Error>.*)`
	// taintsExp extracts the taint key from entries like "{key: value}" in
	// scheduler messages.
	taintsExp = `\{(?P<taint>.*?):.*?}`
	// Container waiting-state reasons (see ContainerStateWaiting.Reason).
	crashLoopBackOff    = "CrashLoopBackOff"
	runContainerError   = "RunContainerError"
	ImagePullErr        = "ErrImagePull"
	ImagePullBackOff    = "ImagePullBackOff"
	ErrImagePullBackOff = "ErrImagePullBackOff"

	ReplicaFailureAdmissionErr = "ReplicaFailureAdmissionErr"
	containerCreating          = "ContainerCreating"
	podInitializing            = "PodInitializing"
	// podKind is set on list objects whose GVK kind is missing.
	podKind = "pod"

	// Pod event reasons.
	failedScheduling = "FailedScheduling"
	unhealthy        = "Unhealthy"
	// execFmtError appears in container logs when a binary was built for the
	// wrong architecture.
	execFmtError = "exec format error"
)
    59  
var (
	// runContainerRe extracts the daemon error message from a RunContainerError.
	runContainerRe = regexp.MustCompile(errorPrefix)
	// taintsRe extracts taint keys from FailedScheduling messages.
	taintsRe = regexp.MustCompile(taintsExp)
	// for testing
	runCli        = executeCLI
	getReplicaSet = deploymentutil.GetAllReplicaSets

	// unknownConditionsOrSuccess lists the status codes that are vague enough
	// that pod events should be consulted for a more specific diagnosis.
	unknownConditionsOrSuccess = map[proto.StatusCode]struct{}{
		proto.StatusCode_STATUSCHECK_UNKNOWN:                   {},
		proto.StatusCode_STATUSCHECK_CONTAINER_WAITING_UNKNOWN: {},
		proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE:     {},
		proto.StatusCode_STATUSCHECK_SUCCESS:                   {},
		proto.StatusCode_STATUSCHECK_POD_INITIALIZING:          {},
	}
)
    75  
// PodValidator implements the Validator interface for Pods
type PodValidator struct {
	k           kubernetes.Interface // client used to fetch pod events
	podSelector PodSelector          // selects which pods to validate
	recos       []Recommender        // produce suggestions for detected error codes
}
    82  
    83  // NewPodValidator initializes a PodValidator
    84  func NewPodValidator(k kubernetes.Interface, s PodSelector) *PodValidator {
    85  	rs := []Recommender{recommender.ContainerError{}}
    86  	return &PodValidator{k: k, recos: rs, podSelector: s}
    87  }
    88  
// Validate implements the Validate method for Validator interface.
// It selects pods matching ns/opts, derives each pod's status (refined with
// recent pod events), attaches recommendations for the detected error code,
// and returns one Resource per pod.
func (p *PodValidator) Validate(ctx context.Context, ns string, opts metav1.ListOptions) ([]Resource, error) {
	pods, err := p.podSelector.Select(ctx, ns, opts)
	if err != nil {
		return []Resource{}, err
	}
	eventsClient := p.k.CoreV1().Events(ns)
	var rs []Resource
	for _, po := range pods {
		ps := p.getPodStatus(&po)
		// Update Pod status from Pod events if required
		updated := processPodEvents(eventsClient, po, ps)
		// The GVK group is not populated for List Objects. Hence set `kind` to `pod`
		// See https://github.com/kubernetes-sigs/controller-runtime/pull/389
		if po.Kind == "" {
			po.Kind = podKind
		}
		// Add recommendations
		for _, r := range p.recos {
			if s := r.Make(updated.ae.ErrCode); s.SuggestionCode != proto.SuggestionCode_NIL {
				updated.ae.Suggestions = append(updated.ae.Suggestions, s)
			}
		}
		// NOTE(review): &po is the address of the range variable; before Go 1.22
		// all iterations share a single variable, so this relies on
		// NewResourceFromObject copying what it needs — confirm against the
		// module's Go version.
		rs = append(rs, NewResourceFromObject(&po, Status(updated.phase), &updated.ae, updated.logs))
	}
	return rs, nil
}
   116  
   117  func (p *PodValidator) getPodStatus(pod *v1.Pod) *podStatus {
   118  	ps := newPodStatus(pod.Name, pod.Namespace, string(pod.Status.Phase))
   119  	switch pod.Status.Phase {
   120  	case v1.PodSucceeded:
   121  		return ps
   122  	default:
   123  		return ps.withErrAndLogs(getPodStatus(pod))
   124  	}
   125  }
   126  
   127  func getPodStatus(pod *v1.Pod) (proto.StatusCode, []string, error) {
   128  	// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions
   129  
   130  	// If the event type PodReady with status True is found then we return success immediately
   131  	if isPodReady(pod) {
   132  		return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
   133  	}
   134  	// If the event type PodScheduled with status False is found then we check if it is due to taints and tolerations.
   135  	if c, ok := isPodNotScheduled(pod); ok {
   136  		log.Entry(context.TODO()).Debugf("Pod %q not scheduled: checking tolerations", pod.Name)
   137  		sc, err := getUntoleratedTaints(c.Reason, c.Message)
   138  		return sc, nil, err
   139  	}
   140  	// we can check the container status if the pod has been scheduled successfully. This can be determined by having the event
   141  	// PodScheduled with status True, or a ContainerReady or PodReady event with status False.
   142  	if isPodScheduledButNotReady(pod) {
   143  		log.Entry(context.TODO()).Debugf("Pod %q scheduled but not ready: checking container statuses", pod.Name)
   144  		// TODO(dgageot): Add EphemeralContainerStatuses
   145  		cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
   146  		// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
   147  		statusCode, logs, err := getContainerStatus(pod, cs)
   148  		if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
   149  			// Determine if an init container is still running and fetch the init logs.
   150  			for _, c := range pod.Status.InitContainerStatuses {
   151  				if c.State.Waiting != nil {
   152  					return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
   153  				} else if c.State.Running != nil {
   154  					sc, l := getPodLogs(pod, c.Name, statusCode)
   155  					return sc, l, fmt.Errorf("waiting for init container %s to complete", c.Name)
   156  				}
   157  			}
   158  		}
   159  		return statusCode, logs, err
   160  	}
   161  
   162  	if c, ok := isPodStatusUnknown(pod); ok {
   163  		log.Entry(context.TODO()).Debugf("Pod %q condition status of type %s is unknown", pod.Name, c.Type)
   164  		return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
   165  	}
   166  
   167  	log.Entry(context.TODO()).Debugf("Unable to determine current service state of pod %q", pod.Name)
   168  	return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf("unable to determine current service state of pod %q", pod.Name)
   169  }
   170  
   171  func isPodReady(pod *v1.Pod) bool {
   172  	for _, c := range pod.Status.Conditions {
   173  		if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
   174  			return true
   175  		}
   176  	}
   177  	return false
   178  }
   179  
   180  func isPodNotScheduled(pod *v1.Pod) (v1.PodCondition, bool) {
   181  	for _, c := range pod.Status.Conditions {
   182  		if c.Type == v1.PodScheduled && c.Status == v1.ConditionFalse {
   183  			return c, true
   184  		}
   185  	}
   186  	return v1.PodCondition{}, false
   187  }
   188  
   189  func isPodScheduledButNotReady(pod *v1.Pod) bool {
   190  	for _, c := range pod.Status.Conditions {
   191  		if c.Type == v1.PodScheduled && c.Status == v1.ConditionTrue {
   192  			return true
   193  		}
   194  		if c.Type == v1.ContainersReady && c.Status == v1.ConditionFalse {
   195  			return true
   196  		}
   197  		if c.Type == v1.PodReady && c.Status == v1.ConditionFalse {
   198  			return true
   199  		}
   200  	}
   201  	return false
   202  }
   203  
   204  func isPodStatusUnknown(pod *v1.Pod) (v1.PodCondition, bool) {
   205  	for _, c := range pod.Status.Conditions {
   206  		if c.Status == v1.ConditionUnknown {
   207  			return c, true
   208  		}
   209  	}
   210  	return v1.PodCondition{}, false
   211  }
   212  
   213  func getContainerStatus(po *v1.Pod, cs []v1.ContainerStatus) (proto.StatusCode, []string, error) {
   214  	// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
   215  	for _, c := range cs {
   216  		switch {
   217  		case c.State.Waiting != nil:
   218  			return extractErrorMessageFromWaitingContainerStatus(po, c)
   219  		case c.State.Terminated != nil && c.State.Terminated.ExitCode != 0:
   220  			sc, l := getPodLogs(po, c.Name, proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED)
   221  			return sc, l, fmt.Errorf("container %s terminated with exit code %d", c.Name, c.State.Terminated.ExitCode)
   222  		}
   223  	}
   224  	// No waiting or terminated containers, pod should be in good health.
   225  	return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil
   226  }
   227  
   228  func getUntoleratedTaints(reason string, message string) (proto.StatusCode, error) {
   229  	matches := taintsRe.FindAllStringSubmatch(message, -1)
   230  	errCode := proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE
   231  	if len(matches) == 0 {
   232  		return errCode, fmt.Errorf("%s: %s", reason, message)
   233  	}
   234  	messages := make([]string, len(matches))
   235  	// TODO: Add actionable item to fix these errors.
   236  	for i, m := range matches {
   237  		if len(m) < 2 {
   238  			continue
   239  		}
   240  		t := m[1]
   241  		switch t {
   242  		case v1.TaintNodeMemoryPressure:
   243  			messages[i] = "1 node has memory pressure"
   244  			errCode = proto.StatusCode_STATUSCHECK_NODE_MEMORY_PRESSURE
   245  		case v1.TaintNodeDiskPressure:
   246  			messages[i] = "1 node has disk pressure"
   247  			errCode = proto.StatusCode_STATUSCHECK_NODE_DISK_PRESSURE
   248  		case v1.TaintNodePIDPressure:
   249  			messages[i] = "1 node has PID pressure"
   250  			errCode = proto.StatusCode_STATUSCHECK_NODE_PID_PRESSURE
   251  		case v1.TaintNodeNotReady:
   252  			messages[i] = "1 node is not ready"
   253  			if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE {
   254  				errCode = proto.StatusCode_STATUSCHECK_NODE_NOT_READY
   255  			}
   256  		case v1.TaintNodeUnreachable:
   257  			messages[i] = "1 node is unreachable"
   258  			if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE {
   259  				errCode = proto.StatusCode_STATUSCHECK_NODE_UNREACHABLE
   260  			}
   261  		case v1.TaintNodeUnschedulable:
   262  			messages[i] = "1 node is unschedulable"
   263  			if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE {
   264  				errCode = proto.StatusCode_STATUSCHECK_NODE_UNSCHEDULABLE
   265  			}
   266  		case v1.TaintNodeNetworkUnavailable:
   267  			messages[i] = "1 node's network not available"
   268  			if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE {
   269  				errCode = proto.StatusCode_STATUSCHECK_NODE_NETWORK_UNAVAILABLE
   270  			}
   271  		}
   272  	}
   273  	return errCode, fmt.Errorf("%s: 0/%d nodes available: %s", reason, len(messages), strings.Join(messages, ", "))
   274  }
   275  
// processPodEvents refines a pod's status using the most recent warning event.
// It only runs when the current error code is in unknownConditionsOrSuccess;
// an already-specific status is returned unchanged. Failures to fetch events
// are treated as best-effort and logged at debug level.
func processPodEvents(e corev1.EventInterface, pod v1.Pod, ps *podStatus) *podStatus {
	updated := ps
	if _, ok := unknownConditionsOrSuccess[ps.ae.ErrCode]; !ok {
		return updated
	}
	log.Entry(context.TODO()).Debugf("Fetching events for pod %q", pod.Name)
	// Get pod events.
	scheme := runtime.NewScheme()
	scheme.AddKnownTypes(v1.SchemeGroupVersion, &pod)
	events, err := e.Search(scheme, &pod)
	if err != nil {
		log.Entry(context.TODO()).Debugf("Could not fetch events for resource %q due to %v", pod.Name, err)
		return updated
	}
	// find the latest event.
	var recentEvent *v1.Event
	for _, e := range events.Items {
		event := e.DeepCopy()
		if recentEvent == nil || recentEvent.LastTimestamp.Before(&event.LastTimestamp) {
			recentEvent = event
		}
	}
	// Normal events carry no diagnostic value; keep the existing status.
	if recentEvent == nil || recentEvent.Type == v1.EventTypeNormal {
		return updated
	}
	switch recentEvent.Reason {
	case failedScheduling:
		updated.updateAE(proto.StatusCode_STATUSCHECK_FAILED_SCHEDULING, recentEvent.Message)
	case unhealthy:
		updated.updateAE(proto.StatusCode_STATUSCHECK_UNHEALTHY, recentEvent.Message)
	default:
		// TODO: Add unique error codes for reasons
		updated.updateAE(
			proto.StatusCode_STATUSCHECK_UNKNOWN_EVENT,
			fmt.Sprintf("%s: %s", recentEvent.Reason, recentEvent.Message),
		)
	}

	return updated
}
   316  
// podStatus aggregates everything the validator learned about one pod:
// identity, lifecycle phase, captured container logs, and the actionable
// error (code + message + suggestions) to surface to the user.
type podStatus struct {
	name      string
	namespace string
	phase     string // Pod phase string, e.g. "Running" or "Succeeded"
	logs      []string
	ae        proto.ActionableErr
}
   324  
   325  func (p *podStatus) isStable() bool {
   326  	return p.phase == success || (p.phase == running && p.ae.Message == "")
   327  }
   328  
   329  func (p *podStatus) withErrAndLogs(errCode proto.StatusCode, l []string, err error) *podStatus {
   330  	var msg string
   331  	if err != nil {
   332  		msg = err.Error()
   333  	}
   334  	p.updateAE(errCode, msg)
   335  	p.logs = l
   336  	return p
   337  }
   338  
   339  func (p *podStatus) updateAE(errCode proto.StatusCode, msg string) {
   340  	p.ae.ErrCode = errCode
   341  	p.ae.Message = msg
   342  }
   343  
   344  func (p *podStatus) String() string {
   345  	switch {
   346  	case p.isStable():
   347  		return ""
   348  	default:
   349  		if p.ae.Message != "" {
   350  			return p.ae.Message
   351  		}
   352  	}
   353  	return fmt.Sprintf(actionableMessage, p.namespace, p.name)
   354  }
   355  
// extractErrorMessageFromWaitingContainerStatus maps a waiting container's
// reason to a status code, optional logs, and a user-facing error. Unknown
// waiting reasons fall through to STATUSCHECK_CONTAINER_WAITING_UNKNOWN.
func extractErrorMessageFromWaitingContainerStatus(po *v1.Pod, c v1.ContainerStatus) (proto.StatusCode, []string, error) {
	// Extract meaningful error out of container statuses.
	switch c.State.Waiting.Reason {
	case podInitializing:
		// container is waiting to run. This could be because one of the init containers is
		// still not completed
		return proto.StatusCode_STATUSCHECK_POD_INITIALIZING, nil, nil
	case containerCreating:
		return proto.StatusCode_STATUSCHECK_CONTAINER_CREATING, nil, fmt.Errorf("creating container %s", c.Name)
	case crashLoopBackOff:
		// TODO, in case of container restarting, return the original failure reason due to which container failed.
		sc, l := getPodLogs(po, c.Name, proto.StatusCode_STATUSCHECK_CONTAINER_RESTARTING)
		return sc, l, fmt.Errorf("container %s is backing off waiting to restart", c.Name)
	case ImagePullErr, ImagePullBackOff, ErrImagePullBackOff:
		return proto.StatusCode_STATUSCHECK_IMAGE_PULL_ERR, nil, fmt.Errorf("container %s is waiting to start: %s can't be pulled", c.Name, c.Image)
	case runContainerError:
		// match[3] is the <Error> capture group of runContainerRe (after the
		// "Error response from daemon:" prefix).
		match := runContainerRe.FindStringSubmatch(c.State.Waiting.Message)
		if len(match) != 0 {
			return proto.StatusCode_STATUSCHECK_RUN_CONTAINER_ERR, nil, fmt.Errorf("container %s in error: %s", c.Name, trimSpace(match[3]))
		}
	}
	log.Entry(context.TODO()).Debugf("Unknown waiting reason for container %q: %v", c.Name, c.State)
	return proto.StatusCode_STATUSCHECK_CONTAINER_WAITING_UNKNOWN, nil, fmt.Errorf("container %s in error: %v", c.Name, c.State.Waiting)
}
   380  
   381  func newPodStatus(n string, ns string, p string) *podStatus {
   382  	return &podStatus{
   383  		name:      n,
   384  		namespace: ns,
   385  		phase:     p,
   386  		ae: proto.ActionableErr{
   387  			ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
   388  		},
   389  	}
   390  }
   391  
   392  func trimSpace(msg string) string {
   393  	return strings.Trim(msg, " ")
   394  }
   395  
// getPodLogs shells out to `kubectl logs` (via the runCli seam, replaceable
// in tests) for container c of pod po. Each returned log line is prefixed
// with "[pod container]". If the logs mention "exec format error" the status
// code is upgraded to STATUSCHECK_CONTAINER_EXEC_ERROR; otherwise the
// caller-supplied sc is returned unchanged.
func getPodLogs(po *v1.Pod, c string, sc proto.StatusCode) (proto.StatusCode, []string) {
	log.Entry(context.TODO()).Debugf("Fetching logs for container %s/%s", po.Name, c)
	logCommand := []string{"kubectl", "logs", po.Name, "-n", po.Namespace, "-c", c}
	logs, err := runCli(logCommand[0], logCommand[1:])
	if err != nil {
		// Surface the failure and the exact command so users can retry by hand.
		return sc, []string{fmt.Sprintf("Error retrieving logs for pod %s: %s.\nTry `%s`", po.Name, err, strings.Join(logCommand, " "))}
	}
	if strings.Contains(string(logs), execFmtError) {
		sc = proto.StatusCode_STATUSCHECK_CONTAINER_EXEC_ERROR
	}
	output := strings.Split(string(logs), "\n")
	// remove spurious empty lines (empty string or from trailing newline)
	lines := make([]string, 0, len(output))
	for _, s := range output {
		if s == "" {
			continue
		}
		lines = append(lines, fmt.Sprintf("[%s %s] %s", po.Name, c, s))
	}
	return sc, lines
}
   417  
   418  func executeCLI(cmdName string, args []string) ([]byte, error) {
   419  	cmd := exec.Command(cmdName, args...)
   420  	return cmd.CombinedOutput()
   421  }
   422  
   423  func isPodOwnedBy(po v1.Pod, controller metav1.Object) bool {
   424  	if controller == nil {
   425  		return true
   426  	}
   427  	return metav1.IsControlledBy(&po, controller)
   428  }