github.com/GoogleContainerTools/skaffold@v1.39.18/pkg/skaffold/kubernetes/status/resource/deployment.go (about)

     1  /*
     2  Copyright 2019 The Skaffold Authors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package resource
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"regexp"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/GoogleContainerTools/skaffold/pkg/diag"
    28  	"github.com/GoogleContainerTools/skaffold/pkg/diag/validator"
    29  	sErrors "github.com/GoogleContainerTools/skaffold/pkg/skaffold/errors"
    30  	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/event"
    31  	eventV2 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/event/v2"
    32  	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubectl"
    33  	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/output/log"
    34  	"github.com/GoogleContainerTools/skaffold/proto/v1"
    35  	protoV2 "github.com/GoogleContainerTools/skaffold/proto/v2"
    36  )
    37  
    38  const (
    39  	deploymentRolloutSuccess = "successfully rolled out"
    40  	connectionErrMsg         = "Unable to connect to the server"
    41  	killedErrMsg             = "signal: killed"
    42  	defaultPodCheckDeadline  = 30 * time.Second
    43  	tabHeader                = " -"
    44  	tab                      = "  "
    45  	maxLogLines              = 3
    46  )
    47  
    48  // Type represents a kubernetes resource type to health check.
    49  type Type string
    50  
    51  var (
    52  	statefulsetRolloutSuccess = regexp.MustCompile("(roll out|rolling update) complete")
    53  
    54  	msgKubectlKilled     = "kubectl rollout status command interrupted\n"
    55  	MsgKubectlConnection = "kubectl connection error\n"
    56  
    57  	nonRetryContainerErrors = map[proto.StatusCode]struct{}{
    58  		proto.StatusCode_STATUSCHECK_IMAGE_PULL_ERR:       {},
    59  		proto.StatusCode_STATUSCHECK_RUN_CONTAINER_ERR:    {},
    60  		proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED: {},
    61  		proto.StatusCode_STATUSCHECK_CONTAINER_RESTARTING: {},
    62  	}
    63  
    64  	ResourceTypes = struct {
    65  		StandalonePods  Type
    66  		Deployment      Type
    67  		StatefulSet     Type
    68  		ConfigConnector Type
    69  	}{
    70  		StandalonePods:  "standalone-pods",
    71  		Deployment:      "deployment",
    72  		StatefulSet:     "statefulset",
    73  		ConfigConnector: "config-connector-resource",
    74  	}
    75  )
    76  
    77  type Group map[string]*Resource
    78  
    79  func (r Group) Add(d *Resource) {
    80  	r[d.ID()] = d
    81  }
    82  
    83  func (r Group) Contains(d *Resource) bool {
    84  	_, found := r[d.ID()]
    85  	return found
    86  }
    87  
    88  func (r Group) Reset() {
    89  	for k := range r {
    90  		delete(r, k)
    91  	}
    92  }
    93  
// Resource holds the status-check state of one kubernetes resource together
// with the child resources last returned by its validator.
type Resource struct {
	// name, namespace and rType identify the resource; together they form ID().
	name             string
	namespace        string
	rType            Type
	// status is the latest status recorded by UpdateStatus.
	status           Status
	// statusCode is the error code of the latest recorded status.
	statusCode       proto.StatusCode
	// done is set once the check reached a terminal state or MarkComplete was called.
	done             bool
	// deadline is the value returned by Deadline().
	deadline         time.Duration
	// resources are the child resources (e.g. pods) stored by fetchPods, keyed by their String().
	resources        map[string]validator.Resource
	// resoureValidator is run by fetchPods to obtain the child resources.
	resoureValidator diag.Diagnose
}
   105  
// ID returns the key identifying this resource, formatted as
// "<name>:<namespace>:<type>". Group uses it as its map key.
func (r *Resource) ID() string {
	return fmt.Sprintf("%s:%s:%s", r.name, r.namespace, r.rType)
}
   109  
// Deadline returns the deadline the resource was created with.
func (r *Resource) Deadline() time.Duration {
	return r.deadline
}
   113  
   114  func (r *Resource) UpdateStatus(ae *proto.ActionableErr) {
   115  	updated := newStatus(ae)
   116  	if r.status.Equal(updated) {
   117  		r.status.changed = false
   118  		return
   119  	}
   120  	r.status = updated
   121  	r.statusCode = updated.ActionableError().ErrCode
   122  	r.status.changed = true
   123  	if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS || isErrAndNotRetryAble(ae.ErrCode) {
   124  		r.done = true
   125  	}
   126  }
   127  
   128  func NewResource(name string, rType Type, ns string, deadline time.Duration) *Resource {
   129  	return &Resource{
   130  		name:             name,
   131  		namespace:        ns,
   132  		rType:            rType,
   133  		status:           newStatus(&proto.ActionableErr{}),
   134  		deadline:         deadline,
   135  		resoureValidator: diag.New(nil),
   136  	}
   137  }
   138  
// WithValidator replaces the resource's validator with pd and returns the
// same resource so calls can be chained.
func (r *Resource) WithValidator(pd diag.Diagnose) *Resource {
	r.resoureValidator = pd
	return r
}
   143  
   144  func (r *Resource) checkStandalonePodsStatus(ctx context.Context, cfg kubectl.Config) *proto.ActionableErr {
   145  	if len(r.resources) == 0 {
   146  		return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING}
   147  	}
   148  	kubeCtl := kubectl.NewCLI(cfg, "")
   149  	var pendingPods []string
   150  	for _, pod := range r.resources {
   151  		switch pod.Status() {
   152  		case "Failed":
   153  			return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_UNKNOWN, Message: fmt.Sprintf("pod %s failed", pod.Name())}
   154  		case "Running":
   155  			b, _ := kubeCtl.RunOut(ctx, "get", "pod", pod.Name(), "-o", `jsonpath={..status.conditions[?(@.type=="Ready")].status}`, "--namespace", pod.Namespace())
   156  			if ctx.Err() != nil {
   157  				return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED}
   158  			}
   159  			if podReady, _ := strconv.ParseBool(string(b)); !podReady {
   160  				pendingPods = append(pendingPods, pod.Name())
   161  			}
   162  		default:
   163  			pendingPods = append(pendingPods, pod.Name())
   164  		}
   165  	}
   166  	if len(pendingPods) > 0 {
   167  		return &proto.ActionableErr{
   168  			ErrCode: proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING,
   169  			Message: fmt.Sprintf("pods not ready: %v", pendingPods),
   170  		}
   171  	}
   172  	return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS}
   173  }
   174  
   175  func (r *Resource) checkConfigConnectorStatus() *proto.ActionableErr {
   176  	if len(r.resources) == 0 {
   177  		return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS}
   178  	}
   179  	var pendingResources []string
   180  	for _, resource := range r.resources {
   181  		ae := resource.ActionableError()
   182  		if ae == nil {
   183  			continue
   184  		}
   185  		switch ae.ErrCode {
   186  		case proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_FAILED, proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_TERMINATING:
   187  			return ae
   188  		case proto.StatusCode_STATUSCHECK_SUCCESS:
   189  			continue
   190  		default:
   191  			pendingResources = append(pendingResources, resource.Name())
   192  		}
   193  	}
   194  	if len(pendingResources) > 0 {
   195  		return &proto.ActionableErr{
   196  			ErrCode: proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS,
   197  			Message: fmt.Sprintf("config connector resources not ready: %v", pendingResources),
   198  		}
   199  	}
   200  	return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS}
   201  }
   202  
   203  func (r *Resource) checkRolloutStatus(ctx context.Context, cfg kubectl.Config) *proto.ActionableErr {
   204  	kubeCtl := kubectl.NewCLI(cfg, "")
   205  
   206  	b, err := kubeCtl.RunOut(ctx, "rollout", "status", string(r.rType), r.name, "--namespace", r.namespace, "--watch=false")
   207  	if ctx.Err() != nil {
   208  		return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED}
   209  	}
   210  
   211  	details := r.cleanupStatus(string(b))
   212  	return parseKubectlRolloutError(details, r.deadline, err)
   213  }
   214  
   215  func (r *Resource) CheckStatus(ctx context.Context, cfg kubectl.Config) {
   216  	var ae *proto.ActionableErr
   217  	switch r.rType {
   218  	case ResourceTypes.StandalonePods:
   219  		ae = r.checkStandalonePodsStatus(ctx, cfg)
   220  	case ResourceTypes.ConfigConnector:
   221  		ae = r.checkConfigConnectorStatus()
   222  	default:
   223  		ae = r.checkRolloutStatus(ctx, cfg)
   224  	}
   225  
   226  	r.UpdateStatus(ae)
   227  	// send event update in check status.
   228  	// if deployment is successfully rolled out, send pod success event to make sure
   229  	// all pod are marked as success in V2
   230  	// See https://github.com/GoogleCloudPlatform/cloud-code-vscode-internal/issues/5277
   231  	if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS {
   232  		for _, pod := range r.resources {
   233  			eventV2.ResourceStatusCheckEventCompletedMessage(
   234  				pod.String(),
   235  				fmt.Sprintf("%s %s: running.\n", tabHeader, pod.String()),
   236  				&protoV2.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS},
   237  			)
   238  		}
   239  		return
   240  	}
   241  	if err := r.fetchPods(ctx); err != nil {
   242  		log.Entry(ctx).Debugf("pod statuses could not be fetched this time due to %s", err)
   243  	}
   244  }
   245  
   246  func (r *Resource) String() string {
   247  	switch r.rType {
   248  	case ResourceTypes.StandalonePods:
   249  		return "pods"
   250  	default:
   251  		if r.namespace == "default" {
   252  			return fmt.Sprintf("%s/%s", r.rType, r.name)
   253  		}
   254  
   255  		return fmt.Sprintf("%s:%s/%s", r.namespace, r.rType, r.name)
   256  	}
   257  }
   258  
// Name returns the resource name without namespace or type.
func (r *Resource) Name() string {
	return r.name
}
   262  
// Status returns the most recently recorded status of the resource.
func (r *Resource) Status() Status {
	return r.status
}
   266  
   267  func (r *Resource) IsStatusCheckCompleteOrCancelled() bool {
   268  	return r.done || r.statusCode == proto.StatusCode_STATUSCHECK_USER_CANCELLED
   269  }
   270  
   271  func (r *Resource) StatusMessage() string {
   272  	for _, p := range r.resources {
   273  		if s := p.ActionableError(); s.ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS {
   274  			return fmt.Sprintf("%s\n", s.Message)
   275  		}
   276  	}
   277  	return r.status.String()
   278  }
   279  
// MarkComplete flags the resource as done regardless of its current status.
func (r *Resource) MarkComplete() {
	r.done = true
}
   283  
   284  // ReportSinceLastUpdated returns a string representing rollout status along with tab header
   285  // e.g.
   286  //   - testNs:deployment/leeroy-app: waiting for rollout to complete. (1/2) pending
   287  //   - testNs:pod/leeroy-app-xvbg : error pulling container image
   288  func (r *Resource) ReportSinceLastUpdated(isMuted bool) string {
   289  	if r.status.reported && !r.status.changed {
   290  		return ""
   291  	}
   292  	r.status.reported = true
   293  	if r.status.String() == "" {
   294  		return ""
   295  	}
   296  	var result strings.Builder
   297  	// Pod container statuses can be empty.
   298  	// This can happen when
   299  	// 1. No pods have been scheduled for the rollout
   300  	// 2. All containers are in running phase with no errors.
   301  	// In such case, avoid printing any status update for the rollout.
   302  	for _, p := range r.resources {
   303  		if s := p.ActionableError().Message; s != "" {
   304  			result.WriteString(fmt.Sprintf("%s %s %s: %s\n", tab, tabHeader, p, s))
   305  			// if logs are muted, write container logs to file and last 3 lines to
   306  			// result.
   307  			out, writeTrimLines, err := withLogFile(p.Name(), &result, p.Logs(), isMuted)
   308  			if err != nil {
   309  				log.Entry(context.TODO()).Debugf("could not create log file %v", err)
   310  			}
   311  			trimLines := []string{}
   312  			for i, l := range p.Logs() {
   313  				formattedLine := fmt.Sprintf("%s %s > %s\n", tab, tab, strings.TrimSuffix(l, "\n"))
   314  				if isMuted && i >= len(p.Logs())-maxLogLines {
   315  					trimLines = append(trimLines, formattedLine)
   316  				}
   317  				out.Write([]byte(formattedLine))
   318  			}
   319  			writeTrimLines(trimLines)
   320  		}
   321  	}
   322  	return fmt.Sprintf("%s %s: %s%s", tabHeader, r, r.StatusMessage(), result.String())
   323  }
   324  
   325  func (r *Resource) cleanupStatus(msg string) string {
   326  	switch r.rType {
   327  	case ResourceTypes.Deployment:
   328  		clean := strings.ReplaceAll(msg, `deployment "`+r.Name()+`" `, "")
   329  		if len(clean) > 0 {
   330  			clean = strings.ToLower(clean[0:1]) + clean[1:]
   331  		}
   332  		return clean
   333  	default:
   334  		return msg
   335  	}
   336  }
   337  
   338  // parses out connection error
   339  // $kubectl logs somePod -f
   340  // Unable to connect to the server: dial tcp x.x.x.x:443: connect: network is unreachable
   341  
   342  // Parses out errors when kubectl was killed on client side
   343  // $kubectl logs testPod  -f
   344  // 2020/06/18 17:28:31 service is running
   345  // Killed: 9
   346  func parseKubectlRolloutError(details string, deadline time.Duration, err error) *proto.ActionableErr {
   347  	switch {
   348  	// deployment rollouts have success messages like `deployment "skaffold-foo" successfully rolled out`
   349  	case err == nil && strings.Contains(details, deploymentRolloutSuccess):
   350  		return &proto.ActionableErr{
   351  			ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
   352  			Message: details,
   353  		}
   354  	// statefulset rollouts have success messages like `statefulset rolling update complete 2 pods at revision skaffold-foo`
   355  	case err == nil && statefulsetRolloutSuccess.MatchString(details):
   356  		return &proto.ActionableErr{
   357  			ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS,
   358  			Message: details,
   359  		}
   360  	case err == nil:
   361  		return &proto.ActionableErr{
   362  			ErrCode: proto.StatusCode_STATUSCHECK_DEPLOYMENT_ROLLOUT_PENDING,
   363  			Message: details,
   364  		}
   365  	case strings.Contains(err.Error(), connectionErrMsg):
   366  		return &proto.ActionableErr{
   367  			ErrCode: proto.StatusCode_STATUSCHECK_KUBECTL_CONNECTION_ERR,
   368  			Message: MsgKubectlConnection,
   369  		}
   370  	case strings.Contains(err.Error(), killedErrMsg):
   371  		return &proto.ActionableErr{
   372  			ErrCode: proto.StatusCode_STATUSCHECK_KUBECTL_PID_KILLED,
   373  			Message: fmt.Sprintf("received Ctrl-C or deployments could not stabilize within %v: %s", deadline, msgKubectlKilled),
   374  		}
   375  	default:
   376  		return &proto.ActionableErr{
   377  			ErrCode: proto.StatusCode_STATUSCHECK_UNKNOWN,
   378  			Message: err.Error(),
   379  		}
   380  	}
   381  }
   382  
   383  func isErrAndNotRetryAble(statusCode proto.StatusCode) bool {
   384  	return statusCode != proto.StatusCode_STATUSCHECK_KUBECTL_CONNECTION_ERR &&
   385  		statusCode != proto.StatusCode_STATUSCHECK_DEPLOYMENT_ROLLOUT_PENDING &&
   386  		statusCode != proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING &&
   387  		statusCode != proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS
   388  }
   389  
   390  // HasEncounteredUnrecoverableError goes through all pod statuses and return true
   391  // if any cannot be recovered
   392  func (r *Resource) HasEncounteredUnrecoverableError() bool {
   393  	for _, p := range r.resources {
   394  		if _, ok := nonRetryContainerErrors[p.ActionableError().ErrCode]; ok {
   395  			return true
   396  		}
   397  	}
   398  	return false
   399  }
   400  
   401  func (r *Resource) fetchPods(ctx context.Context) error {
   402  	timeoutContext, cancel := context.WithTimeout(ctx, defaultPodCheckDeadline)
   403  	defer cancel()
   404  	pods, err := r.resoureValidator.Run(timeoutContext)
   405  	if err != nil {
   406  		return err
   407  	}
   408  
   409  	newResources := map[string]validator.Resource{}
   410  	r.status.changed = false
   411  	for _, p := range pods {
   412  		originalPod, found := r.resources[p.String()]
   413  		if !found || originalPod.StatusUpdated(p) {
   414  			r.status.changed = true
   415  			prefix := fmt.Sprintf("%s %s:", tabHeader, p.String())
   416  			if p.ActionableError().ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS &&
   417  				p.ActionableError().Message != "" {
   418  				event.ResourceStatusCheckEventUpdated(p.String(), p.ActionableError())
   419  				eventV2.ResourceStatusCheckEventUpdatedMessage(
   420  					p.String(),
   421  					prefix,
   422  					sErrors.V2fromV1(p.ActionableError()))
   423  			}
   424  		}
   425  		newResources[p.String()] = p
   426  	}
   427  	r.resources = newResources
   428  	return nil
   429  }
   430  
   431  // StatusCode returns the rollout status code if the status check is cancelled
   432  // or if no pod data exists for this rollout.
   433  // If pods are fetched, this function returns the error code a pod container encountered.
   434  func (r *Resource) StatusCode() proto.StatusCode {
   435  	// do not process pod status codes
   436  	// 1) the user aborted the run or
   437  	// 2) if another rollout failed which cancelled this deployment status check
   438  	// 3) the deployment is successful. In case of successful rollouts, the code doesn't fetch the updated pod statuses.
   439  	if r.statusCode == proto.StatusCode_STATUSCHECK_USER_CANCELLED || r.statusCode == proto.StatusCode_STATUSCHECK_SUCCESS {
   440  		return r.statusCode
   441  	}
   442  	for _, p := range r.resources {
   443  		if s := p.ActionableError().ErrCode; s != proto.StatusCode_STATUSCHECK_SUCCESS {
   444  			return s
   445  		}
   446  	}
   447  	return r.statusCode
   448  }
   449  
   450  func (r *Resource) WithPodStatuses(scs []proto.StatusCode) *Resource {
   451  	r.resources = map[string]validator.Resource{}
   452  	for i, s := range scs {
   453  		name := fmt.Sprintf("%s-%d", r.name, i)
   454  		r.resources[name] = validator.NewResource("test", "pod", "foo", validator.Status("failed"),
   455  			&proto.ActionableErr{Message: "pod failed", ErrCode: s}, nil)
   456  	}
   457  	return r
   458  }