github.com/GoogleContainerTools/skaffold@v1.39.18/pkg/skaffold/kubernetes/status/status_check.go

/*
Copyright 2019 The Skaffold Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package status

import (
	"context"
	"fmt"
	"io"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"golang.org/x/sync/singleflight"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"

	"github.com/GoogleContainerTools/skaffold/pkg/diag"
	"github.com/GoogleContainerTools/skaffold/pkg/diag/validator"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/config"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/constants"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/deploy/label"
	sErrors "github.com/GoogleContainerTools/skaffold/pkg/skaffold/errors"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/event"
	eventV2 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/event/v2"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/instrumentation"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubectl"
	kubernetesclient "github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/client"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/manifest"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubernetes/status/resource"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/output"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/output/log"
	"github.com/GoogleContainerTools/skaffold/pkg/skaffold/status"
	timeutil "github.com/GoogleContainerTools/skaffold/pkg/skaffold/util/time"
	"github.com/GoogleContainerTools/skaffold/proto/v1"
)

var (
	// DefaultStatusCheckDeadline is the default timeout for resource status checks.
	DefaultStatusCheckDeadline = 10 * time.Minute

	// defaultPollPeriodInMilliseconds is the period between resource status checks, set to 1 second.
	defaultPollPeriodInMilliseconds = 1000

	// reportStatusTime is how often the status of pending resources is reported, set to 5 seconds.
	reportStatusTime = 5 * time.Second
)

const (
	tabHeader = " -"
	// kubernetesMaxDeadline is the Kubernetes default progressDeadlineSeconds (600s);
	// deployments left at this default use Skaffold's own deadline instead.
	kubernetesMaxDeadline = 600
)

// counter tracks how many resources are pending, failed, or cancelled.
type counter struct {
	total     int
	pending   int32
	failed    int32
	cancelled int32
}

type Config interface {
	kubectl.Config

	StatusCheckDeadlineSeconds() int
	Muted() config.Muted
	StatusCheck() *bool
}

// Monitor runs status checks for selected resources
type Monitor interface {
	status.Monitor
	RegisterDeployManifests(manifest.ManifestList)
}

type monitor struct {
	cfg             Config
	labeller        *label.DefaultLabeller
	deadlineSeconds int
	muteLogs        bool
	seenResources   resource.Group
	singleRun       singleflight.Group
	namespaces      *[]string
	kubeContext     string
	manifests       manifest.ManifestList
}

// NewStatusMonitor returns a status monitor which runs checks on selected resource rollouts.
// Currently implemented for deployments, statefulsets, standalone pods, and Config Connector resources.
func NewStatusMonitor(cfg Config, labeller *label.DefaultLabeller, namespaces *[]string) Monitor {
	return &monitor{
		muteLogs:        cfg.Muted().MuteStatusCheck(),
		cfg:             cfg,
		labeller:        labeller,
		deadlineSeconds: cfg.StatusCheckDeadlineSeconds(),
		seenResources:   make(resource.Group),
		singleRun:       singleflight.Group{},
		namespaces:      namespaces,
		kubeContext:     cfg.GetKubeContext(),
		manifests:       make(manifest.ManifestList, 0),
	}
}
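
// A minimal usage sketch (not part of the original file; cfg, labeller,
// namespaces, and manifests are assumed to come from the caller's deploy
// setup). A caller would typically construct the monitor once per run and
// invoke Check after deploying:
//
//	monitor := NewStatusMonitor(cfg, labeller, &namespaces)
//	monitor.RegisterDeployManifests(manifests)
//	if err := monitor.Check(ctx, os.Stdout); err != nil {
//		// resources failed to stabilize, or the check was cancelled
//	}
//
// Because Check is guarded by singleflight keyed on the run ID, concurrent
// callers within the same dev iteration share a single underlying check.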
func (s *monitor) RegisterDeployManifests(manifests manifest.ManifestList) {
	if len(s.manifests) == 0 {
		s.manifests = manifests
		return
	}
	for _, m := range manifests {
		s.manifests.Append(m)
	}
}

// Check runs the status checks on selected resource rollouts in the current skaffold dev iteration.
// Currently implemented for deployments, statefulsets, standalone pods, and Config Connector resources.
func (s *monitor) Check(ctx context.Context, out io.Writer) error {
	_, err, _ := s.singleRun.Do(s.labeller.GetRunID(), func() (interface{}, error) {
		return struct{}{}, s.check(ctx, out)
	})
	return err
}

func (s *monitor) check(ctx context.Context, out io.Writer) error {
	event.StatusCheckEventStarted()
	ctx, endTrace := instrumentation.StartTrace(ctx, "performStatusCheck_WaitForDeploymentToStabilize")
	defer endTrace()

	start := time.Now()
	output.Default.Fprintln(out, "Waiting for deployments to stabilize...")

	errCode, err := s.statusCheck(ctx, out)
	event.StatusCheckEventEnded(errCode, err)
	if err != nil {
		return err
	}

	output.Default.Fprintln(out, "Deployments stabilized in", timeutil.Humanize(time.Since(start)))
	return nil
}

func (s *monitor) Reset() {
	s.seenResources.Reset()
}

func (s *monitor) statusCheck(ctx context.Context, out io.Writer) (proto.StatusCode, error) {
	client, err := kubernetesclient.Client(s.kubeContext)
	if err != nil {
		return proto.StatusCode_STATUSCHECK_KUBECTL_CLIENT_FETCH_ERR, fmt.Errorf("getting Kubernetes client: %w", err)
	}
	dynClient, err := kubernetesclient.DynamicClient(s.kubeContext)
	if err != nil {
		return proto.StatusCode_STATUSCHECK_KUBECTL_CLIENT_FETCH_ERR, fmt.Errorf("getting Kubernetes dynamic client: %w", err)
	}
	resources := make([]*resource.Resource, 0)
	for _, n := range *s.namespaces {
		newDeployments, err := getDeployments(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_DEPLOYMENT_FETCH_ERR, fmt.Errorf("could not fetch deployments: %w", err)
		}
		for _, d := range newDeployments {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}

		newStatefulSets, err := getStatefulSets(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_STATEFULSET_FETCH_ERR, fmt.Errorf("could not fetch statefulsets: %w", err)
		}
		for _, d := range newStatefulSets {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}

		newStandalonePods, err := getStandalonePods(ctx, client, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_STANDALONE_PODS_FETCH_ERR, fmt.Errorf("could not fetch standalone pods: %w", err)
		}
		for _, pods := range newStandalonePods {
			if s.seenResources.Contains(pods) {
				continue
			}
			resources = append(resources, pods)
			s.seenResources.Add(pods)
		}

		newConfigConnectorResources, err := getConfigConnectorResources(client, dynClient, s.manifests, n, s.labeller, getDeadline(s.deadlineSeconds))
		if err != nil {
			return proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_RESOURCES_FETCH_ERR, fmt.Errorf("could not fetch config connector resources: %w", err)
		}
		for _, d := range newConfigConnectorResources {
			if s.seenResources.Contains(d) {
				continue
			}
			resources = append(resources, d)
			s.seenResources.Add(d)
		}
	}

	var wg sync.WaitGroup
	c := newCounter(len(resources))

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	var exitStatusOnce sync.Once
	var exitStatus proto.StatusCode

	for _, d := range resources {
		wg.Add(1)
		go func(r *resource.Resource) {
			defer wg.Done()
			// keep updating the resource status until it fails, succeeds, times out, or is cancelled.
			pollResourceStatus(ctx, s.cfg, r)
			rcCopy, failed := c.markProcessed(ctx, r.StatusCode())
			s.printStatusCheckSummary(out, r, rcCopy)
			// if a resource fails, cancel status checks for all resources to fail fast
			// and capture the first failed exit code.
			if failed {
				exitStatusOnce.Do(func() {
					exitStatus = r.StatusCode()
				})
				cancel()
			}
		}(d)
	}

	// Periodically report the status of still-pending resources.
	go func() {
		s.printResourceStatus(ctx, out, resources)
	}()

	// Wait for all resource statuses to be fetched.
	wg.Wait()
	return getSkaffoldDeployStatus(ctx, c, exitStatus)
}
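
// The goroutine fan-out above is a common fail-fast pattern: one goroutine per
// resource, a shared cancellable context, and a sync.Once that records only
// the first failure before cancelling everyone else. A distilled sketch of the
// same shape (task and run are hypothetical names, not part of this package):
//
//	ctx, cancel := context.WithCancel(ctx)
//	defer cancel()
//	var once sync.Once
//	var firstErr error
//	var wg sync.WaitGroup
//	for _, t := range tasks {
//		wg.Add(1)
//		go func(t task) {
//			defer wg.Done()
//			if err := t.run(ctx); err != nil {
//				once.Do(func() { firstErr = err })
//				cancel() // stop all remaining tasks immediately
//			}
//		}(t)
//	}
//	wg.Wait()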
func getStandalonePods(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	var result []*resource.Resource
	selector := validator.NewStandalonePodsSelector(client)
	pods, err := selector.Select(ctx, ns, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch standalone pods: %w", err)
	}
	if len(pods) == 0 {
		return result, nil
	}
	pd := diag.New([]string{ns}).
		WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
		WithValidators([]validator.Validator{validator.NewPodValidator(client, selector)})
	result = append(result, resource.NewResource(string(resource.ResourceTypes.StandalonePods), resource.ResourceTypes.StandalonePods, ns, deadlineDuration).WithValidator(pd))

	return result, nil
}

func getConfigConnectorResources(client kubernetes.Interface, dynClient dynamic.Interface, m manifest.ManifestList, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	var result []*resource.Resource
	uRes, err := m.SelectResources(manifest.ConfigConnectorResourceSelector...)
	if err != nil {
		return nil, fmt.Errorf("could not fetch config connector resources: %w", err)
	}
	for _, r := range uRes {
		resName := r.GroupVersionKind().String()
		if r.GetName() != "" {
			resName = fmt.Sprintf("%s, Name=%s", resName, r.GetName())
		}
		pd := diag.New([]string{ns}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewConfigConnectorValidator(client, dynClient, r.GroupVersionKind())})
		result = append(result, resource.NewResource(resName, resource.ResourceTypes.ConfigConnector, ns, deadlineDuration).WithValidator(pd))
	}

	return result, nil
}
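
// All of the fetchers in this file discover resources through the run-ID label
// that Skaffold attaches to everything it deploys; l.RunIDSelector() renders it
// as a standard Kubernetes label selector. Assuming the usual label key, the
// deployment lookup below is roughly equivalent to:
//
//	kubectl get deployments -n <namespace> -l skaffold.dev/run-id=<run-id>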
func getDeployments(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadlineDuration time.Duration) ([]*resource.Resource, error) {
	deps, err := client.AppsV1().Deployments(ns).List(ctx, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch deployments: %w", err)
	}

	resources := make([]*resource.Resource, len(deps.Items))
	for i, d := range deps.Items {
		// Prefer the deployment's own progressDeadlineSeconds, unless it is unset
		// or left at the Kubernetes default (600), in which case Skaffold's
		// status-check deadline applies.
		var deadline time.Duration
		if d.Spec.ProgressDeadlineSeconds == nil || *d.Spec.ProgressDeadlineSeconds == kubernetesMaxDeadline {
			deadline = deadlineDuration
		} else {
			deadline = time.Duration(*d.Spec.ProgressDeadlineSeconds) * time.Second
		}

		pd := diag.New([]string{d.Namespace}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewPodValidator(client, validator.NewDeploymentPodsSelector(client, d))})

		for k, v := range d.Spec.Template.Labels {
			pd = pd.WithLabel(k, v)
		}

		resources[i] = resource.NewResource(d.Name, resource.ResourceTypes.Deployment, d.Namespace, deadline).WithValidator(pd)
	}
	return resources, nil
}

func getStatefulSets(ctx context.Context, client kubernetes.Interface, ns string, l *label.DefaultLabeller, deadline time.Duration) ([]*resource.Resource, error) {
	sets, err := client.AppsV1().StatefulSets(ns).List(ctx, metav1.ListOptions{
		LabelSelector: l.RunIDSelector(),
	})
	if err != nil {
		return nil, fmt.Errorf("could not fetch stateful sets: %w", err)
	}

	resources := make([]*resource.Resource, len(sets.Items))
	for i, ss := range sets.Items {
		pd := diag.New([]string{ss.Namespace}).
			WithLabel(label.RunIDLabel, l.Labels()[label.RunIDLabel]).
			WithValidators([]validator.Validator{validator.NewPodValidator(client, validator.NewStatefulSetPodsSelector(client, ss))})

		for k, v := range ss.Spec.Template.Labels {
			pd = pd.WithLabel(k, v)
		}

		resources[i] = resource.NewResource(ss.Name, resource.ResourceTypes.StatefulSet, ss.Namespace, deadline).WithValidator(pd)
	}
	return resources, nil
}
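
// pollResourceStatus below pairs a time.Ticker with a context deadline so that
// whichever fires first wins the select. The generic skeleton (a sketch;
// checkOnce is a hypothetical stand-in for r.CheckStatus plus its exit tests):
//
//	ctx, cancel := context.WithTimeout(parent, deadline+pollPeriod)
//	defer cancel()
//	ticker := time.NewTicker(pollPeriod)
//	defer ticker.Stop()
//	for {
//		select {
//		case <-ctx.Done():
//			return // cancelled or deadline exceeded
//		case <-ticker.C:
//			if done := checkOnce(ctx); done {
//				return
//			}
//		}
//	}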
func pollResourceStatus(ctx context.Context, cfg kubectl.Config, r *resource.Resource) {
	pollDuration := time.Duration(defaultPollPeriodInMilliseconds) * time.Millisecond
	ticker := time.NewTicker(pollDuration)
	defer ticker.Stop()
	// Add poll duration to account for one last attempt after progressDeadlineSeconds.
	timeoutContext, cancel := context.WithTimeout(ctx, r.Deadline()+pollDuration)
	log.Entry(ctx).Debugf("checking status %s", r)
	defer cancel()
	for {
		select {
		case <-timeoutContext.Done():
			switch c := timeoutContext.Err(); c {
			case context.Canceled:
				r.UpdateStatus(&proto.ActionableErr{
					ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED,
					Message: "check cancelled\n",
				})
			case context.DeadlineExceeded:
				r.UpdateStatus(&proto.ActionableErr{
					ErrCode: proto.StatusCode_STATUSCHECK_DEADLINE_EXCEEDED,
					Message: fmt.Sprintf("could not stabilize within %v\n", r.Deadline()),
				})
			}
			return
		case <-ticker.C:
			r.CheckStatus(timeoutContext, cfg)
			if r.IsStatusCheckCompleteOrCancelled() {
				return
			}
			// Fail immediately if any pod container errors cannot be recovered.
			// StatusCheck is not interruptible: since changes to build or deploy
			// dependencies do not trigger a new check, exit immediately rather
			// than waiting for statusCheckDeadlineSeconds.
			// TODO: https://github.com/GoogleContainerTools/skaffold/pull/4591
			if r.HasEncounteredUnrecoverableError() {
				r.MarkComplete()
				return
			}
		}
	}
}

func getSkaffoldDeployStatus(ctx context.Context, c *counter, sc proto.StatusCode) (proto.StatusCode, error) {
	if c.total == int(c.cancelled) && c.total > 0 {
		err := fmt.Errorf("%d/%d deployment(s) status check cancelled", c.cancelled, c.total)
		return proto.StatusCode_STATUSCHECK_USER_CANCELLED, err
	}
	// return success if no failures were found.
	if c.failed == 0 {
		return proto.StatusCode_STATUSCHECK_SUCCESS, nil
	}
	// construct an error message and return the appropriate error code
	err := fmt.Errorf("%d/%d deployment(s) failed", c.failed, c.total)
	if sc == proto.StatusCode_STATUSCHECK_SUCCESS || sc == 0 {
		log.Entry(ctx).Debugf("found statuscode %s. setting skaffold deploy status to STATUSCHECK_INTERNAL_ERROR.", sc)
		return proto.StatusCode_STATUSCHECK_INTERNAL_ERROR, err
	}
	log.Entry(ctx).Debugf("setting skaffold deploy status to %s.", sc)
	return sc, err
}

func getDeadline(d int) time.Duration {
	if d > 0 {
		return time.Duration(d) * time.Second
	}
	return DefaultStatusCheckDeadline
}
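
// getDeadline treats any non-positive configured value as "use the default";
// for illustration (the values follow directly from the code above):
//
//	getDeadline(0)   // DefaultStatusCheckDeadline, i.e. 10m
//	getDeadline(-1)  // DefaultStatusCheckDeadline, i.e. 10m
//	getDeadline(120) // 2m0s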
func (s *monitor) printStatusCheckSummary(out io.Writer, r *resource.Resource, c counter) {
	ae := r.Status().ActionableError()
	if r.StatusCode() == proto.StatusCode_STATUSCHECK_USER_CANCELLED {
		// Don't print the status summary if the user pressed Ctrl-C or
		// another deployment already failed.
		return
	}
	event.ResourceStatusCheckEventCompleted(r.String(), ae)
	eventV2.ResourceStatusCheckEventCompleted(r.String(), sErrors.V2fromV1(ae))
	out, _ = output.WithEventContext(context.Background(), out, constants.Deploy, r.String())
	status := fmt.Sprintf("%s %s", tabHeader, r)
	if ae.ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS {
		if str := r.ReportSinceLastUpdated(s.muteLogs); str != "" {
			fmt.Fprintln(out, trimNewLine(str))
		}
		status = fmt.Sprintf("%s failed. Error: %s.",
			status,
			trimNewLine(r.StatusMessage()),
		)
	} else {
		status = fmt.Sprintf("%s is ready.%s", status, getPendingMessage(c.pending, c.total))
	}

	fmt.Fprintln(out, status)
}

// printResourceStatus prints resource statuses until all status checks are completed or the context is cancelled.
func (s *monitor) printResourceStatus(ctx context.Context, out io.Writer, resources []*resource.Resource) {
	ticker := time.NewTicker(reportStatusTime)
	defer ticker.Stop()
	for {
		var allDone bool
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			allDone = s.printStatus(resources, out)
		}
		if allDone {
			return
		}
	}
}

func (s *monitor) printStatus(resources []*resource.Resource, out io.Writer) bool {
	allDone := true
	for _, r := range resources {
		if r.IsStatusCheckCompleteOrCancelled() {
			continue
		}
		allDone = false
		if str := r.ReportSinceLastUpdated(s.muteLogs); str != "" {
			ae := r.Status().ActionableError()
			event.ResourceStatusCheckEventUpdated(r.String(), ae)
			eventV2.ResourceStatusCheckEventUpdated(r.String(), sErrors.V2fromV1(ae))
			out, _ := output.WithEventContext(context.Background(), out, constants.Deploy, r.String())
			fmt.Fprintln(out, trimNewLine(str))
		}
	}
	return allDone
}

func getPendingMessage(pending int32, total int) string {
	if pending > 0 {
		return fmt.Sprintf(" [%d/%d deployment(s) still pending]", pending, total)
	}
	return ""
}

func trimNewLine(msg string) string {
	return strings.TrimSuffix(msg, "\n")
}

func newCounter(i int) *counter {
	return &counter{
		total:   i,
		pending: int32(i),
	}
}

// markProcessed decrements the pending count and records whether the resource
// was cancelled or failed; it returns a snapshot of the counter and whether
// the resource should be treated as a failure.
func (c *counter) markProcessed(ctx context.Context, sc proto.StatusCode) (counter, bool) {
	atomic.AddInt32(&c.pending, -1)
	if ctx.Err() == context.Canceled {
		log.Entry(ctx).Debug("marking resource status check cancelled", sc)
		atomic.AddInt32(&c.cancelled, 1)
		return c.copy(), false
	} else if sc == proto.StatusCode_STATUSCHECK_SUCCESS {
		return c.copy(), false
	}
	log.Entry(ctx).Debugf("marking resource failed due to error code %s", sc)
	atomic.AddInt32(&c.failed, 1)
	return c.copy(), true
}

func (c *counter) copy() counter {
	return counter{
		total:     c.total,
		pending:   c.pending,
		failed:    c.failed,
		cancelled: c.cancelled,
	}
}

// NoopMonitor is a no-op implementation of Monitor, used when status checking is disabled.
type NoopMonitor struct {
	status.NoopMonitor
}

func (n *NoopMonitor) RegisterDeployManifests(manifest.ManifestList) {}
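
// Usage sketch for NoopMonitor (an assumption about caller behavior, not shown
// in this file): when status checking is disabled, e.g. via the --status-check
// flag, callers can substitute this no-op so the deploy path stays uniform:
//
//	var m Monitor = &NoopMonitor{}
//	_ = m.Check(ctx, io.Discard) // the embedded status.NoopMonitor returns immediately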