github.com/GoogleContainerTools/skaffold@v1.39.18/pkg/skaffold/kubernetes/status/resource/deployment.go (about) 1 /* 2 Copyright 2019 The Skaffold Authors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package resource 18 19 import ( 20 "context" 21 "fmt" 22 "regexp" 23 "strconv" 24 "strings" 25 "time" 26 27 "github.com/GoogleContainerTools/skaffold/pkg/diag" 28 "github.com/GoogleContainerTools/skaffold/pkg/diag/validator" 29 sErrors "github.com/GoogleContainerTools/skaffold/pkg/skaffold/errors" 30 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/event" 31 eventV2 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/event/v2" 32 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubectl" 33 "github.com/GoogleContainerTools/skaffold/pkg/skaffold/output/log" 34 "github.com/GoogleContainerTools/skaffold/proto/v1" 35 protoV2 "github.com/GoogleContainerTools/skaffold/proto/v2" 36 ) 37 38 const ( 39 deploymentRolloutSuccess = "successfully rolled out" 40 connectionErrMsg = "Unable to connect to the server" 41 killedErrMsg = "signal: killed" 42 defaultPodCheckDeadline = 30 * time.Second 43 tabHeader = " -" 44 tab = " " 45 maxLogLines = 3 46 ) 47 48 // Type represents a kubernetes resource type to health check. 49 type Type string 50 51 var ( 52 statefulsetRolloutSuccess = regexp.MustCompile("(roll out|rolling update) complete") 53 54 msgKubectlKilled = "kubectl rollout status command interrupted\n" 55 MsgKubectlConnection = "kubectl connection error\n" 56 57 nonRetryContainerErrors = map[proto.StatusCode]struct{}{ 58 proto.StatusCode_STATUSCHECK_IMAGE_PULL_ERR: {}, 59 proto.StatusCode_STATUSCHECK_RUN_CONTAINER_ERR: {}, 60 proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED: {}, 61 proto.StatusCode_STATUSCHECK_CONTAINER_RESTARTING: {}, 62 } 63 64 ResourceTypes = struct { 65 StandalonePods Type 66 Deployment Type 67 StatefulSet Type 68 ConfigConnector Type 69 }{ 70 StandalonePods: "standalone-pods", 71 Deployment: "deployment", 72 StatefulSet: "statefulset", 73 ConfigConnector: "config-connector-resource", 74 } 75 ) 76 77 type Group map[string]*Resource 78 79 func (r Group) Add(d *Resource) { 80 r[d.ID()] = d 81 } 82 83 func (r Group) Contains(d *Resource) bool { 84 _, found := r[d.ID()] 85 return found 86 } 87 88 func (r Group) Reset() { 89 for k := range r { 90 delete(r, k) 91 } 92 } 93 94 type Resource struct { 95 name string 96 namespace string 97 rType Type 98 status Status 99 statusCode proto.StatusCode 100 done bool 101 deadline time.Duration 102 resources map[string]validator.Resource 103 resoureValidator diag.Diagnose 104 } 105 106 func (r *Resource) ID() string { 107 return fmt.Sprintf("%s:%s:%s", r.name, r.namespace, r.rType) 108 } 109 110 func (r *Resource) Deadline() time.Duration { 111 return r.deadline 112 } 113 114 func (r *Resource) UpdateStatus(ae *proto.ActionableErr) { 115 updated := newStatus(ae) 116 if r.status.Equal(updated) { 117 r.status.changed = false 118 return 119 } 120 r.status = updated 121 r.statusCode = updated.ActionableError().ErrCode 122 r.status.changed = true 123 if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS || isErrAndNotRetryAble(ae.ErrCode) { 124 r.done = true 125 } 126 } 127 128 func NewResource(name string, rType Type, ns string, deadline time.Duration) *Resource { 129 return &Resource{ 130 name: name, 131 namespace: ns, 132 rType: rType, 133 status: newStatus(&proto.ActionableErr{}), 134 deadline: deadline, 135 resoureValidator: diag.New(nil), 136 } 137 } 138 139 func (r *Resource) WithValidator(pd diag.Diagnose) *Resource { 140 r.resoureValidator = pd 141 return r 142 } 143 144 func (r *Resource) checkStandalonePodsStatus(ctx context.Context, cfg kubectl.Config) *proto.ActionableErr { 145 if len(r.resources) == 0 { 146 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING} 147 } 148 kubeCtl := kubectl.NewCLI(cfg, "") 149 var pendingPods []string 150 for _, pod := range r.resources { 151 switch pod.Status() { 152 case "Failed": 153 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_UNKNOWN, Message: fmt.Sprintf("pod %s failed", pod.Name())} 154 case "Running": 155 b, _ := kubeCtl.RunOut(ctx, "get", "pod", pod.Name(), "-o", `jsonpath={..status.conditions[?(@.type=="Ready")].status}`, "--namespace", pod.Namespace()) 156 if ctx.Err() != nil { 157 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED} 158 } 159 if podReady, _ := strconv.ParseBool(string(b)); !podReady { 160 pendingPods = append(pendingPods, pod.Name()) 161 } 162 default: 163 pendingPods = append(pendingPods, pod.Name()) 164 } 165 } 166 if len(pendingPods) > 0 { 167 return &proto.ActionableErr{ 168 ErrCode: proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING, 169 Message: fmt.Sprintf("pods not ready: %v", pendingPods), 170 } 171 } 172 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS} 173 } 174 175 func (r *Resource) checkConfigConnectorStatus() *proto.ActionableErr { 176 if len(r.resources) == 0 { 177 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS} 178 } 179 var pendingResources []string 180 for _, resource := range r.resources { 181 ae := resource.ActionableError() 182 if ae == nil { 183 continue 184 } 185 switch ae.ErrCode { 186 case proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_FAILED, proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_TERMINATING: 187 return ae 188 case proto.StatusCode_STATUSCHECK_SUCCESS: 189 continue 190 default: 191 pendingResources = append(pendingResources, resource.Name()) 192 } 193 } 194 if len(pendingResources) > 0 { 195 return &proto.ActionableErr{ 196 ErrCode: proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS, 197 Message: fmt.Sprintf("config connector resources not ready: %v", pendingResources), 198 } 199 } 200 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS} 201 } 202 203 func (r *Resource) checkRolloutStatus(ctx context.Context, cfg kubectl.Config) *proto.ActionableErr { 204 kubeCtl := kubectl.NewCLI(cfg, "") 205 206 b, err := kubeCtl.RunOut(ctx, "rollout", "status", string(r.rType), r.name, "--namespace", r.namespace, "--watch=false") 207 if ctx.Err() != nil { 208 return &proto.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_USER_CANCELLED} 209 } 210 211 details := r.cleanupStatus(string(b)) 212 return parseKubectlRolloutError(details, r.deadline, err) 213 } 214 215 func (r *Resource) CheckStatus(ctx context.Context, cfg kubectl.Config) { 216 var ae *proto.ActionableErr 217 switch r.rType { 218 case ResourceTypes.StandalonePods: 219 ae = r.checkStandalonePodsStatus(ctx, cfg) 220 case ResourceTypes.ConfigConnector: 221 ae = r.checkConfigConnectorStatus() 222 default: 223 ae = r.checkRolloutStatus(ctx, cfg) 224 } 225 226 r.UpdateStatus(ae) 227 // send event update in check status. 228 // if deployment is successfully rolled out, send pod success event to make sure 229 // all pod are marked as success in V2 230 // See https://github.com/GoogleCloudPlatform/cloud-code-vscode-internal/issues/5277 231 if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS { 232 for _, pod := range r.resources { 233 eventV2.ResourceStatusCheckEventCompletedMessage( 234 pod.String(), 235 fmt.Sprintf("%s %s: running.\n", tabHeader, pod.String()), 236 &protoV2.ActionableErr{ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS}, 237 ) 238 } 239 return 240 } 241 if err := r.fetchPods(ctx); err != nil { 242 log.Entry(ctx).Debugf("pod statuses could not be fetched this time due to %s", err) 243 } 244 } 245 246 func (r *Resource) String() string { 247 switch r.rType { 248 case ResourceTypes.StandalonePods: 249 return "pods" 250 default: 251 if r.namespace == "default" { 252 return fmt.Sprintf("%s/%s", r.rType, r.name) 253 } 254 255 return fmt.Sprintf("%s:%s/%s", r.namespace, r.rType, r.name) 256 } 257 } 258 259 func (r *Resource) Name() string { 260 return r.name 261 } 262 263 func (r *Resource) Status() Status { 264 return r.status 265 } 266 267 func (r *Resource) IsStatusCheckCompleteOrCancelled() bool { 268 return r.done || r.statusCode == proto.StatusCode_STATUSCHECK_USER_CANCELLED 269 } 270 271 func (r *Resource) StatusMessage() string { 272 for _, p := range r.resources { 273 if s := p.ActionableError(); s.ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS { 274 return fmt.Sprintf("%s\n", s.Message) 275 } 276 } 277 return r.status.String() 278 } 279 280 func (r *Resource) MarkComplete() { 281 r.done = true 282 } 283 284 // ReportSinceLastUpdated returns a string representing rollout status along with tab header 285 // e.g. 286 // - testNs:deployment/leeroy-app: waiting for rollout to complete. (1/2) pending 287 // - testNs:pod/leeroy-app-xvbg : error pulling container image 288 func (r *Resource) ReportSinceLastUpdated(isMuted bool) string { 289 if r.status.reported && !r.status.changed { 290 return "" 291 } 292 r.status.reported = true 293 if r.status.String() == "" { 294 return "" 295 } 296 var result strings.Builder 297 // Pod container statuses can be empty. 298 // This can happen when 299 // 1. No pods have been scheduled for the rollout 300 // 2. All containers are in running phase with no errors. 301 // In such case, avoid printing any status update for the rollout. 302 for _, p := range r.resources { 303 if s := p.ActionableError().Message; s != "" { 304 result.WriteString(fmt.Sprintf("%s %s %s: %s\n", tab, tabHeader, p, s)) 305 // if logs are muted, write container logs to file and last 3 lines to 306 // result. 307 out, writeTrimLines, err := withLogFile(p.Name(), &result, p.Logs(), isMuted) 308 if err != nil { 309 log.Entry(context.TODO()).Debugf("could not create log file %v", err) 310 } 311 trimLines := []string{} 312 for i, l := range p.Logs() { 313 formattedLine := fmt.Sprintf("%s %s > %s\n", tab, tab, strings.TrimSuffix(l, "\n")) 314 if isMuted && i >= len(p.Logs())-maxLogLines { 315 trimLines = append(trimLines, formattedLine) 316 } 317 out.Write([]byte(formattedLine)) 318 } 319 writeTrimLines(trimLines) 320 } 321 } 322 return fmt.Sprintf("%s %s: %s%s", tabHeader, r, r.StatusMessage(), result.String()) 323 } 324 325 func (r *Resource) cleanupStatus(msg string) string { 326 switch r.rType { 327 case ResourceTypes.Deployment: 328 clean := strings.ReplaceAll(msg, `deployment "`+r.Name()+`" `, "") 329 if len(clean) > 0 { 330 clean = strings.ToLower(clean[0:1]) + clean[1:] 331 } 332 return clean 333 default: 334 return msg 335 } 336 } 337 338 // parses out connection error 339 // $kubectl logs somePod -f 340 // Unable to connect to the server: dial tcp x.x.x.x:443: connect: network is unreachable 341 342 // Parses out errors when kubectl was killed on client side 343 // $kubectl logs testPod -f 344 // 2020/06/18 17:28:31 service is running 345 // Killed: 9 346 func parseKubectlRolloutError(details string, deadline time.Duration, err error) *proto.ActionableErr { 347 switch { 348 // deployment rollouts have success messages like `deployment "skaffold-foo" successfully rolled out` 349 case err == nil && strings.Contains(details, deploymentRolloutSuccess): 350 return &proto.ActionableErr{ 351 ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS, 352 Message: details, 353 } 354 // statefulset rollouts have success messages like `statefulset rolling update complete 2 pods at revision skaffold-foo` 355 case err == nil && statefulsetRolloutSuccess.MatchString(details): 356 return &proto.ActionableErr{ 357 ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS, 358 Message: details, 359 } 360 case err == nil: 361 return &proto.ActionableErr{ 362 ErrCode: proto.StatusCode_STATUSCHECK_DEPLOYMENT_ROLLOUT_PENDING, 363 Message: details, 364 } 365 case strings.Contains(err.Error(), connectionErrMsg): 366 return &proto.ActionableErr{ 367 ErrCode: proto.StatusCode_STATUSCHECK_KUBECTL_CONNECTION_ERR, 368 Message: MsgKubectlConnection, 369 } 370 case strings.Contains(err.Error(), killedErrMsg): 371 return &proto.ActionableErr{ 372 ErrCode: proto.StatusCode_STATUSCHECK_KUBECTL_PID_KILLED, 373 Message: fmt.Sprintf("received Ctrl-C or deployments could not stabilize within %v: %s", deadline, msgKubectlKilled), 374 } 375 default: 376 return &proto.ActionableErr{ 377 ErrCode: proto.StatusCode_STATUSCHECK_UNKNOWN, 378 Message: err.Error(), 379 } 380 } 381 } 382 383 func isErrAndNotRetryAble(statusCode proto.StatusCode) bool { 384 return statusCode != proto.StatusCode_STATUSCHECK_KUBECTL_CONNECTION_ERR && 385 statusCode != proto.StatusCode_STATUSCHECK_DEPLOYMENT_ROLLOUT_PENDING && 386 statusCode != proto.StatusCode_STATUSCHECK_STANDALONE_PODS_PENDING && 387 statusCode != proto.StatusCode_STATUSCHECK_CONFIG_CONNECTOR_IN_PROGRESS 388 } 389 390 // HasEncounteredUnrecoverableError goes through all pod statuses and return true 391 // if any cannot be recovered 392 func (r *Resource) HasEncounteredUnrecoverableError() bool { 393 for _, p := range r.resources { 394 if _, ok := nonRetryContainerErrors[p.ActionableError().ErrCode]; ok { 395 return true 396 } 397 } 398 return false 399 } 400 401 func (r *Resource) fetchPods(ctx context.Context) error { 402 timeoutContext, cancel := context.WithTimeout(ctx, defaultPodCheckDeadline) 403 defer cancel() 404 pods, err := r.resoureValidator.Run(timeoutContext) 405 if err != nil { 406 return err 407 } 408 409 newResources := map[string]validator.Resource{} 410 r.status.changed = false 411 for _, p := range pods { 412 originalPod, found := r.resources[p.String()] 413 if !found || originalPod.StatusUpdated(p) { 414 r.status.changed = true 415 prefix := fmt.Sprintf("%s %s:", tabHeader, p.String()) 416 if p.ActionableError().ErrCode != proto.StatusCode_STATUSCHECK_SUCCESS && 417 p.ActionableError().Message != "" { 418 event.ResourceStatusCheckEventUpdated(p.String(), p.ActionableError()) 419 eventV2.ResourceStatusCheckEventUpdatedMessage( 420 p.String(), 421 prefix, 422 sErrors.V2fromV1(p.ActionableError())) 423 } 424 } 425 newResources[p.String()] = p 426 } 427 r.resources = newResources 428 return nil 429 } 430 431 // StatusCode returns the rollout status code if the status check is cancelled 432 // or if no pod data exists for this rollout. 433 // If pods are fetched, this function returns the error code a pod container encountered. 434 func (r *Resource) StatusCode() proto.StatusCode { 435 // do not process pod status codes 436 // 1) the user aborted the run or 437 // 2) if another rollout failed which cancelled this deployment status check 438 // 3) the deployment is successful. In case of successful rollouts, the code doesn't fetch the updated pod statuses. 439 if r.statusCode == proto.StatusCode_STATUSCHECK_USER_CANCELLED || r.statusCode == proto.StatusCode_STATUSCHECK_SUCCESS { 440 return r.statusCode 441 } 442 for _, p := range r.resources { 443 if s := p.ActionableError().ErrCode; s != proto.StatusCode_STATUSCHECK_SUCCESS { 444 return s 445 } 446 } 447 return r.statusCode 448 } 449 450 func (r *Resource) WithPodStatuses(scs []proto.StatusCode) *Resource { 451 r.resources = map[string]validator.Resource{} 452 for i, s := range scs { 453 name := fmt.Sprintf("%s-%d", r.name, i) 454 r.resources[name] = validator.NewResource("test", "pod", "foo", validator.Status("failed"), 455 &proto.ActionableErr{Message: "pod failed", ErrCode: s}, nil) 456 } 457 return r 458 }