github.com/GoogleContainerTools/skaffold@v1.39.18/pkg/skaffold/kubernetes/status/status_check.go

/*
Copyright 2019 The Skaffold Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
	"context"
	"fmt"
	"io"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"golang.org/x/sync/singleflight"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"

	"github.com/GoogleContainerTools/skaffold/pkg/diag"
	"github.com/GoogleContainerTools/skaffold/pkg/diag/validator"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/config"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/constants"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/deploy/label"
	sErrors "github.com/GoogleContainerTools/skaffold/pkg/skaffold/errors"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/event"
	eventV2 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/event/v2"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/instrumentation"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubectl"
	kubernetesclient "github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/client"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/manifest"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/status/resource"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/output"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/output/log"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/status"
	timeutil "github.com/GoogleContainerTools/skaffold/pkg/skaffold/util/time"
	"github.com/GoogleContainerTools/skaffold/proto/v1"
)

var (
	// DefaultStatusCheckDeadline is the default timeout for resource status checks.
	DefaultStatusCheckDeadline = 10 * time.Minute

	// defaultPollPeriodInMilliseconds is the poll period for status checks (1 second).
	defaultPollPeriodInMilliseconds = 1000

	// reportStatusTime is the interval at which the status of pending resources is reported (5 seconds).
	reportStatusTime = 5 * time.Second
)

const (
	// tabHeader prefixes each resource line in the printed status summary.
	tabHeader = " -"
	// kubernetesMaxDeadline is the Kubernetes default progressDeadlineSeconds (600s);
	// deployments left at this default use Skaffold's own deadline instead.
	kubernetesMaxDeadline = 600
)

type counter struct {
	total     int
	pending   int32
	failed    int32
	cancelled int32
}

type Config interface {
	kubectl.Config

	StatusCheckDeadlineSeconds() int
	Muted() config.Muted
	StatusCheck() *bool
}

// Monitor runs status checks for selected resources.
type Monitor interface {
	status.Monitor
	RegisterDeployManifests(manifest.ManifestList)
}

type monitor struct {
	cfg             Config
	labeller        *label.DefaultLabeller
	deadlineSeconds int
	muteLogs        bool
	seenResources   resource.Group
	singleRun       singleflight.Group
	namespaces      *[]string
	kubeContext     string
	manifests       manifest.ManifestList
}

// NewStatusMonitor returns a status monitor which runs checks on selected resource rollouts.
// Currently implemented for deployments, statefulsets, standalone pods, and Config Connector resources.
func NewStatusMonitor(cfg Config, labeller *label.DefaultLabeller, namespaces *[]string) Monitor {
	return &monitor{
		muteLogs:        cfg.Muted().MuteStatusCheck(),
		cfg:             cfg,
		labeller:        labeller,
		deadlineSeconds: cfg.StatusCheckDeadlineSeconds(),
		seenResources:   make(resource.Group),
		singleRun:       singleflight.Group{},
		namespaces:      namespaces,
		kubeContext:     cfg.GetKubeContext(),
		manifests:       make(manifest.ManifestList, 0),
	}
}
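
// Usage sketch (illustrative, not part of the original file): how a caller
// might wire up the monitor. cfg, labeller, manifests, ctx, and out are
// assumed to be provided by the surrounding deployer code.
//
//	namespaces := []string{"default"}
//	m := NewStatusMonitor(cfg, labeller, &namespaces)
//	m.RegisterDeployManifests(manifests)
//	if err := m.Check(ctx, out); err != nil {
//		return fmt.Errorf("status check failed: %w", err)
//	}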

func (s *monitor) RegisterDeployManifests(manifests manifest.ManifestList) {
	if len(s.manifests) == 0 {
		s.manifests = manifests
		return
	}
	for _, m := range manifests {
		s.manifests.Append(m)
	}
}

// Check runs the status checks on selected resource rollouts in the current skaffold dev iteration.
// Currently implemented for deployments, statefulsets, standalone pods, and Config Connector resources.
func (s *monitor) Check(ctx context.Context, out io.Writer) error {
	_, err, _ := s.singleRun.Do(s.labeller.GetRunID(), func() (interface{}, error) {
		return struct{}{}, s.check(ctx, out)
	})
	return err
}
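
// Check is keyed on the run ID via singleflight, so concurrent callers within
// one Skaffold run share a single underlying check rather than racing. A
// sketch of the dedup behavior (illustrative only):
//
//	go m.Check(ctx, out) // starts the check
//	go m.Check(ctx, out) // joins the in-flight check instead of starting another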

func (s *monitor) check(ctx context.Context, out io.Writer) error {
	event.StatusCheckEventStarted()
	ctx, endTrace := instrumentation.StartTrace(ctx, "performStatusCheck_WaitForDeploymentToStabilize")
	defer endTrace()

	start := time.Now()
	output.Default.Fprintln(out, "Waiting for deployments to stabilize...")

	errCode, err := s.statusCheck(ctx, out)
	event.StatusCheckEventEnded(errCode, err)
	if err != nil {
		return err
	}

	output.Default.Fprintln(out, "Deployments stabilized in", timeutil.Humanize(time.Since(start)))
	return nil
}

func (s *monitor) Reset() {
	s.seenResources.Reset()
}

func (s *monitor) statusCheck(ctx context.Context, out io.Writer) (proto.StatusCode, error) {
	client, err := kubernetesclient.Client(s.kubeContext)
	if err != nil {
		return proto.StatusCode_STATUSCHECK_KUBECTL_CLIENT_FETCH_ERR, fmt.Errorf("getting Kubernetes client: %w", err)
	}
	dynClient, err := kubernetesclient.DynamicClient(s.kubeContext)
	if err != nil {
		return proto.StatusCode_STATUSCHECK_KUBECTL_CLIENT_FETCH_ERR, fmt.Errorf("getting Kubernetes dynamic client: %w", err)
	}
	resources := make([]*resource.Resource, 0)
	for _, n := range *s.namespaces {
		newDeployments, err := getDeployments(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_DEPLOYMENT_FETCH_ERR, fmt.Errorf("could not fetch deployments: %w", err)
		}
		for _, d := range newDeployments {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}

		newStatefulSets, err := getStatefulSets(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_STATEFULSET_FETCH_ERR, fmt.Errorf("could not fetch statefulsets: %w", err)
		}
		for _, d := range newStatefulSets {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}

		newStandalonePods, err := getStandalonePods(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_STANDALONE_PODS_FETCH_ERR, fmt.Errorf("could not fetch standalone pods: %w", err)
		}
		for _, pods := range newStandalonePods {
			if s.seenResources.Contains(pods) {
				continue
			}
			resources = append(resources, pods)
			s.seenResources.Add(pods)
		}

		newConfigConnectorResources, err := getConfigConnectorResources(client, dynClient, s.manifests, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_RESOURCES_FETCH_ERR, fmt.Errorf("could not fetch config connector resources: %w", err)
		}
		for _, d := range newConfigConnectorResources {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}
	}

	var wg sync.WaitGroup
	c := newCounter(len(resources))

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	var exitStatusOnce sync.Once
	var exitStatus proto.StatusCode

	for _, d := range resources {
		wg.Add(1)
		go func(r *resource.Resource) {
			defer wg.Done()
			// Keep updating the resource status until it fails, succeeds, times out, or is cancelled.
			pollResourceStatus(ctx, s.cfg, r)
			rcCopy, failed := c.markProcessed(ctx, r.StatusCode())
			s.printStatusCheckSummary(out, r, rcCopy)
			// If a resource fails, cancel status checks for all resources to fail fast
			// and capture the first failed exit code.
			if failed {
				exitStatusOnce.Do(func() {
					exitStatus = r.StatusCode()
				})
				cancel()
			}
		}(d)
	}

	// Periodically report the status of still-pending resources.
	go func() {
		s.printResourceStatus(ctx, out, resources)
	}()

	// Wait for all resource statuses to be fetched.
	wg.Wait()
	return getSkaffoldDeployStatus(ctx, c, exitStatus)
}
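
// The fan-out in statusCheck follows a common fail-fast pattern: one goroutine
// per resource, a shared cancellable context, and a sync.Once guarding the
// first recorded exit code. A minimal sketch of the same pattern, independent
// of Skaffold's types (task and tasks are illustrative names):
//
//	var once sync.Once
//	var firstErr error
//	ctx, cancel := context.WithCancel(ctx)
//	defer cancel()
//	var wg sync.WaitGroup
//	for _, t := range tasks {
//		wg.Add(1)
//		go func(t task) {
//			defer wg.Done()
//			if err := t.run(ctx); err != nil {
//				once.Do(func() { firstErr = err })
//				cancel()
//			}
//		}(t)
//	}
//	wg.Wait()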

func getStandalonePods(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	var result []*resource.Resource
	selector := validator.NewStandalonePodsSelector(client)
	pods, err := selector.Select(ctx, ns, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch standalone pods: %w", err)
	}
	if len(pods) == 0 {
		return result, nil
	}
	pd := diag.New([]string{ns}).
		WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
		WithValidators([]validator.Validator{validator.NewPodValidator(client, selector)})
	result = append(result, resource.NewResource(string(resource.ResourceTypes.StandalonePods), resource.ResourceTypes.StandalonePods, ns, deadlineDuration).WithValidator(pd))

	return result, nil
}

func getConfigConnectorResources(client kubernetes.Interface, dynClient dynamic.Interface, m manifest.ManifestList, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	var result []*resource.Resource
	uRes, err := m.SelectResources(manifest.ConfigConnectorResourceSelector...)
	if err != nil {
		return nil, fmt.Errorf("could not fetch config connector resources: %w", err)
	}
	for _, r := range uRes {
		resName := r.GroupVersionKind().String()
		if r.GetName() != "" {
			resName = fmt.Sprintf("%s, Name=%s", resName, r.GetName())
		}
		pd := diag.New([]string{ns}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewConfigConnectorValidator(client, dynClient, r.GroupVersionKind())})
		result = append(result, resource.NewResource(resName, resource.ResourceTypes.ConfigConnector, ns, deadlineDuration).WithValidator(pd))
	}

	return result, nil
}

func getDeployments(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	deps, err := client.AppsV1().Deployments(ns).List(ctx, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch deployments: %w", err)
	}

	resources := make([]*resource.Resource, len(deps.Items))
	for i, d := range deps.Items {
		var deadline time.Duration
		// A nil progressDeadlineSeconds, or one left at the Kubernetes default
		// (600s), means the user did not set it; use Skaffold's deadline instead.
		if d.Spec.ProgressDeadlineSeconds == nil || *d.Spec.ProgressDeadlineSeconds == kubernetesMaxDeadline {
			deadline = deadlineDuration
		} else {
			deadline = time.Duration(*d.Spec.ProgressDeadlineSeconds) * time.Second
		}

		pd := diag.New([]string{d.Namespace}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewPodValidator(client, validator.NewDeploymentPodsSelector(client, d))})

		for k, v := range d.Spec.Template.Labels {
			pd = pd.WithLabel(k, v)
		}

		resources[i] = resource.NewResource(d.Name, resource.ResourceTypes.Deployment, d.Namespace, deadline).WithValidator(pd)
	}
	return resources, nil
}

func getStatefulSets(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadline time.Duration) ([]*resource.Resource, error) {
	sets, err := client.AppsV1().StatefulSets(ns).List(ctx, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch statefulsets: %w", err)
	}

	resources := make([]*resource.Resource, len(sets.Items))
	for i, ss := range sets.Items {
		pd := diag.New([]string{ss.Namespace}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewPodValidator(client, validator.NewStatefulSetPodsSelector(client, ss))})

		for k, v := range ss.Spec.Template.Labels {
			pd = pd.WithLabel(k, v)
		}

		resources[i] = resource.NewResource(ss.Name, resource.ResourceTypes.StatefulSet, ss.Namespace, deadline).WithValidator(pd)
	}
	return resources, nil
}

func pollResourceStatus(ctx context.Context, cfg kubectl.Config, r *resource.Resource) {
	pollDuration := time.Duration(defaultPollPeriodInMilliseconds) * time.Millisecond
	ticker := time.NewTicker(pollDuration)
	defer ticker.Stop()
	// Add poll duration to account for one last attempt after progressDeadlineSeconds.
	timeoutContext, cancel := context.WithTimeout(ctx, r.Deadline()+pollDuration)
	log.Entry(ctx).Debugf("checking status %s", r)
	defer cancel()
	for {
		select {
		case <-timeoutContext.Done():
			switch c := timeoutContext.Err(); c {
			case context.Canceled:
				r.UpdateStatus(&proto.ActionableErr{
					ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED,
					Message: "check cancelled\n",
				})
			case context.DeadlineExceeded:
				r.UpdateStatus(&proto.ActionableErr{
					ErrCode: proto.StatusCode_STATUSCHECK_DEADLINE_EXCEEDED,
					Message: fmt.Sprintf("could not stabilize within %v\n", r.Deadline()),
				})
			}
			return
		case <-ticker.C:
			r.CheckStatus(timeoutContext, cfg)
			if r.IsStatusCheckCompleteOrCancelled() {
				return
			}
			// Fail immediately if any pod container error cannot be recovered.
			// StatusCheck is not interruptible: changes to build or deploy
			// dependencies do not retrigger it, so exit immediately rather
			// than waiting for statusCheckDeadlineSeconds.
			// TODO: https://github.com/GoogleContainerTools/skaffold/pull/4591
			if r.HasEncounteredUnrecoverableError() {
				r.MarkComplete()
				return
			}
		}
	}
}
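
// Timing note (illustrative numbers, not from the original file): with the
// default 1-second poll period and, say, a 10-second resource deadline, the
// loop checks status roughly once per second and gets one final attempt after
// the deadline, because the timeout is set to deadline+pollDuration above.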

func getSkaffoldDeployStatus(ctx context.Context, c *counter, sc proto.StatusCode) (proto.StatusCode, error) {
	if c.total == int(c.cancelled) && c.total > 0 {
		err := fmt.Errorf("%d/%d deployment(s) status check cancelled", c.cancelled, c.total)
		return proto.StatusCode_STATUSCHECK_USER_CANCELLED, err
	}
	// Return success if no failures were found.
	if c.failed == 0 {
		return proto.StatusCode_STATUSCHECK_SUCCESS, nil
	}
	// Construct an error message and return the appropriate error code.
	err := fmt.Errorf("%d/%d deployment(s) failed", c.failed, c.total)
	if sc == proto.StatusCode_STATUSCHECK_SUCCESS || sc == 0 {
		log.Entry(ctx).Debugf("found statuscode %s. setting skaffold deploy status to STATUSCHECK_INTERNAL_ERROR.", sc)
		return proto.StatusCode_STATUSCHECK_INTERNAL_ERROR, err
	}
	log.Entry(ctx).Debugf("setting skaffold deploy status to %s.", sc)
	return sc, err
}

func getDeadline(d int) time.Duration {
	if d > 0 {
		return time.Duration(d) * time.Second
	}
	return DefaultStatusCheckDeadline
}
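
// Behavior sketch for getDeadline (illustrative values only):
//
//	getDeadline(0)  // -> DefaultStatusCheckDeadline (10 minutes)
//	getDeadline(42) // -> 42 * time.Second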

func (s *monitor) printStatusCheckSummary(out io.Writer, r *resource.Resource, c counter) {
	ae := r.Status().ActionableError()
	if r.StatusCode() == proto.StatusCode_STATUSCHECK_USER_CANCELLED {
		// Don't print the status summary if the user pressed Ctrl-C or if
		// another deployment already failed.
		return
	}
	event.ResourceStatusCheckEventCompleted(r.String(), ae)
	eventV2.ResourceStatusCheckEventCompleted(r.String(), sErrors.V2fromV1(ae))
	out, _ = output.WithEventContext(context.Background(), out, constants.Deploy, r.String())
	status := fmt.Sprintf("%s %s", tabHeader, r)
	if ae.ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS {
		if str := r.ReportSinceLastUpdated(s.muteLogs); str != "" {
			fmt.Fprintln(out, trimNewLine(str))
		}
		status = fmt.Sprintf("%s failed. Error: %s.",
			status,
			trimNewLine(r.StatusMessage()),
		)
	} else {
		status = fmt.Sprintf("%s is ready.%s", status, getPendingMessage(c.pending, c.total))
	}

	fmt.Fprintln(out, status)
}

// printResourceStatus prints resource statuses until all status checks are completed or the context is cancelled.
func (s *monitor) printResourceStatus(ctx context.Context, out io.Writer, resources []*resource.Resource) {
	ticker := time.NewTicker(reportStatusTime)
	defer ticker.Stop()
	for {
		var allDone bool
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			allDone = s.printStatus(resources, out)
		}
		if allDone {
			return
		}
	}
}

func (s *monitor) printStatus(resources []*resource.Resource, out io.Writer) bool {
	allDone := true
	for _, r := range resources {
		if r.IsStatusCheckCompleteOrCancelled() {
			continue
		}
		allDone = false
		if str := r.ReportSinceLastUpdated(s.muteLogs); str != "" {
			ae := r.Status().ActionableError()
			event.ResourceStatusCheckEventUpdated(r.String(), ae)
			eventV2.ResourceStatusCheckEventUpdated(r.String(), sErrors.V2fromV1(ae))
			out, _ := output.WithEventContext(context.Background(), out, constants.Deploy, r.String())
			fmt.Fprintln(out, trimNewLine(str))
		}
	}
	return allDone
}

func getPendingMessage(pending int32, total int) string {
	if pending > 0 {
		return fmt.Sprintf(" [%d/%d deployment(s) still pending]", pending, total)
	}
	return ""
}

func trimNewLine(msg string) string {
	return strings.TrimSuffix(msg, "\n")
}

func newCounter(i int) *counter {
	return &counter{
		total:   i,
		pending: int32(i),
	}
}

func (c *counter) markProcessed(ctx context.Context, sc proto.StatusCode) (counter, bool) {
	atomic.AddInt32(&c.pending, -1)
	if ctx.Err() == context.Canceled {
		log.Entry(ctx).Debugf("marking resource status check cancelled: %s", sc)
		atomic.AddInt32(&c.cancelled, 1)
		return c.copy(), false
	} else if sc == proto.StatusCode_STATUSCHECK_SUCCESS {
		return c.copy(), false
	}
	log.Entry(ctx).Debugf("marking resource failed due to error code %s", sc)
	atomic.AddInt32(&c.failed, 1)
	return c.copy(), true
}
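
// Illustrative use of markProcessed (hypothetical values): a goroutine that
// has finished polling a resource records the outcome and learns whether it
// should trigger the fail-fast cancellation in statusCheck.
//
//	c := newCounter(3)
//	snapshot, failed := c.markProcessed(ctx, proto.StatusCode_STATUSCHECK_SUCCESS)
//	// snapshot.pending == 2; failed == false, so no cancellation is triggered.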

func (c *counter) copy() counter {
	return counter{
		total:     c.total,
		pending:   c.pending,
		failed:    c.failed,
		cancelled: c.cancelled,
	}
}

type NoopMonitor struct {
	status.NoopMonitor
}

func (n *NoopMonitor) RegisterDeployManifests(manifest.ManifestList) {}