github.com/GoogleContainerTools/skaffold/v2@v2.13.2/pkg/diag/validator/validator.go (about) 1 /* 2 Copyright 2019 The Skaffold Authors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package validator 18 19 import ( 20 "context" 21 "fmt" 22 "os/exec" 23 "regexp" 24 "strings" 25 26 v1 "k8s.io/api/core/v1" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/runtime" 29 "k8s.io/client-go/kubernetes" 30 corev1 "k8s.io/client-go/kubernetes/typed/core/v1" 31 deploymentutil "k8s.io/kubectl/pkg/util/deployment" 32 33 "github.com/GoogleContainerTools/skaffold/v2/pkg/diag/recommender" 34 "github.com/GoogleContainerTools/skaffold/v2/pkg/skaffold/output/log" 35 proto "github.com/GoogleContainerTools/skaffold/v2/proto/v1" 36 ) 37 38 const ( 39 success = "Succeeded" 40 running = "Running" 41 actionableMessage = `could not determine pod status. Try kubectl describe -n %s po/%s` 42 errorPrefix = `(?P<Prefix>)(?P<DaemonLog>Error response from daemon\:)(?P<Error>.*)` 43 taintsExp = `\{(?P<taint>.*?):.*?}` 44 crashLoopBackOff = "CrashLoopBackOff" 45 runContainerError = "RunContainerError" 46 ImagePullErr = "ErrImagePull" 47 ImagePullBackOff = "ImagePullBackOff" 48 ErrImagePullBackOff = "ErrImagePullBackOff" 49 50 ReplicaFailureAdmissionErr = "ReplicaFailureAdmissionErr" 51 containerCreating = "ContainerCreating" 52 podInitializing = "PodInitializing" 53 podKind = "pod" 54 55 failedScheduling = "FailedScheduling" 56 unhealthy = "Unhealthy" 57 execFmtError = "exec format error" 58 ) 59 60 var ( 61 runContainerRe = regexp.MustCompile(errorPrefix) 62 taintsRe = regexp.MustCompile(taintsExp) 63 // for testing 64 runCli = executeCLI 65 getReplicaSet = deploymentutil.GetAllReplicaSets 66 67 unknownConditionsOrSuccess = map[proto.StatusCode]struct{}{ 68 proto.StatusCode_STATUSCHECK_UNKNOWN: {}, 69 proto.StatusCode_STATUSCHECK_CONTAINER_WAITING_UNKNOWN: {}, 70 proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE: {}, 71 proto.StatusCode_STATUSCHECK_SUCCESS: {}, 72 proto.StatusCode_STATUSCHECK_POD_INITIALIZING: {}, 73 } 74 ) 75 76 // PodValidator implements the Validator interface for Pods 77 type PodValidator struct { 78 k kubernetes.Interface 79 podSelector PodSelector 80 recos []Recommender 81 } 82 83 // NewPodValidator initializes a PodValidator 84 func NewPodValidator(k kubernetes.Interface, s PodSelector) *PodValidator { 85 rs := []Recommender{recommender.ContainerError{}} 86 return &PodValidator{k: k, recos: rs, podSelector: s} 87 } 88 89 // Validate implements the Validate method for Validator interface 90 func (p *PodValidator) Validate(ctx context.Context, ns string, opts metav1.ListOptions) ([]Resource, error) { 91 pods, err := p.podSelector.Select(ctx, ns, opts) 92 if err != nil { 93 return []Resource{}, err 94 } 95 eventsClient := p.k.CoreV1().Events(ns) 96 var rs []Resource 97 for _, po := range pods { 98 ps := p.getPodStatus(&po) 99 // Update Pod status from Pod events if required 100 updated := processPodEvents(eventsClient, po, ps) 101 // The GVK group is not populated for List Objects. Hence set `kind` to `pod` 102 // See https://github.com/kubernetes-sigs/controller-runtime/pull/389 103 if po.Kind == "" { 104 po.Kind = podKind 105 } 106 // Add recommendations 107 for _, r := range p.recos { 108 if s := r.Make(updated.ae.ErrCode); s.SuggestionCode != proto.SuggestionCode_NIL { 109 updated.ae.Suggestions = append(updated.ae.Suggestions, s) 110 } 111 } 112 rs = append(rs, NewResourceFromObject(&po, Status(updated.phase), &updated.ae, updated.logs)) 113 } 114 return rs, nil 115 } 116 117 func (p *PodValidator) getPodStatus(pod *v1.Pod) *podStatus { 118 ps := newPodStatus(pod.Name, pod.Namespace, string(pod.Status.Phase)) 119 switch pod.Status.Phase { 120 case v1.PodSucceeded: 121 return ps 122 default: 123 return ps.withErrAndLogs(getPodStatus(pod)) 124 } 125 } 126 127 func getPodStatus(pod *v1.Pod) (proto.StatusCode, []string, error) { 128 // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions 129 130 // If the event type PodReady with status True is found then we return success immediately 131 if isPodReady(pod) { 132 return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil 133 } 134 // If the event type PodScheduled with status False is found then we check if it is due to taints and tolerations. 135 if c, ok := isPodNotScheduled(pod); ok { 136 log.Entry(context.TODO()).Debugf("Pod %q not scheduled: checking tolerations", pod.Name) 137 sc, err := getUntoleratedTaints(c.Reason, c.Message) 138 return sc, nil, err 139 } 140 // we can check the container status if the pod has been scheduled successfully. This can be determined by having the event 141 // PodScheduled with status True, or a ContainerReady or PodReady event with status False. 142 if isPodScheduledButNotReady(pod) { 143 log.Entry(context.TODO()).Debugf("Pod %q scheduled but not ready: checking container statuses", pod.Name) 144 // TODO(dgageot): Add EphemeralContainerStatuses 145 cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) 146 // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states 147 statusCode, logs, err := getContainerStatus(pod, cs) 148 if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING { 149 // Determine if an init container is still running and fetch the init logs. 150 for _, c := range pod.Status.InitContainerStatuses { 151 if c.State.Waiting != nil { 152 return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name) 153 } else if c.State.Running != nil { 154 sc, l := getPodLogs(pod, c.Name, statusCode) 155 return sc, l, fmt.Errorf("waiting for init container %s to complete", c.Name) 156 } 157 } 158 } 159 return statusCode, logs, err 160 } 161 162 if c, ok := isPodStatusUnknown(pod); ok { 163 log.Entry(context.TODO()).Debugf("Pod %q condition status of type %s is unknown", pod.Name, c.Type) 164 return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message) 165 } 166 167 log.Entry(context.TODO()).Debugf("Unable to determine current service state of pod %q", pod.Name) 168 return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf("unable to determine current service state of pod %q", pod.Name) 169 } 170 171 func isPodReady(pod *v1.Pod) bool { 172 for _, c := range pod.Status.Conditions { 173 if c.Type == v1.PodReady && c.Status == v1.ConditionTrue { 174 return true 175 } 176 } 177 return false 178 } 179 180 func isPodNotScheduled(pod *v1.Pod) (v1.PodCondition, bool) { 181 for _, c := range pod.Status.Conditions { 182 if c.Type == v1.PodScheduled && c.Status == v1.ConditionFalse { 183 return c, true 184 } 185 } 186 return v1.PodCondition{}, false 187 } 188 189 func isPodScheduledButNotReady(pod *v1.Pod) bool { 190 for _, c := range pod.Status.Conditions { 191 if c.Type == v1.PodScheduled && c.Status == v1.ConditionTrue { 192 return true 193 } 194 if c.Type == v1.ContainersReady && c.Status == v1.ConditionFalse { 195 return true 196 } 197 if c.Type == v1.PodReady && c.Status == v1.ConditionFalse { 198 return true 199 } 200 } 201 return false 202 } 203 204 func isPodStatusUnknown(pod *v1.Pod) (v1.PodCondition, bool) { 205 for _, c := range pod.Status.Conditions { 206 if c.Status == v1.ConditionUnknown { 207 return c, true 208 } 209 } 210 return v1.PodCondition{}, false 211 } 212 213 func getContainerStatus(po *v1.Pod, cs []v1.ContainerStatus) (proto.StatusCode, []string, error) { 214 // See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states 215 for _, c := range cs { 216 switch { 217 case c.State.Waiting != nil: 218 return extractErrorMessageFromWaitingContainerStatus(po, c) 219 case c.State.Terminated != nil && c.State.Terminated.ExitCode != 0: 220 sc, l := getPodLogs(po, c.Name, proto.StatusCode_STATUSCHECK_CONTAINER_TERMINATED) 221 return sc, l, fmt.Errorf("container %s terminated with exit code %d", c.Name, c.State.Terminated.ExitCode) 222 } 223 } 224 // No waiting or terminated containers, pod should be in good health. 225 return proto.StatusCode_STATUSCHECK_SUCCESS, nil, nil 226 } 227 228 func getUntoleratedTaints(reason string, message string) (proto.StatusCode, error) { 229 matches := taintsRe.FindAllStringSubmatch(message, -1) 230 errCode := proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE 231 if len(matches) == 0 { 232 return errCode, fmt.Errorf("%s: %s", reason, message) 233 } 234 messages := make([]string, len(matches)) 235 // TODO: Add actionable item to fix these errors. 236 for i, m := range matches { 237 if len(m) < 2 { 238 continue 239 } 240 t := m[1] 241 switch t { 242 case v1.TaintNodeMemoryPressure: 243 messages[i] = "1 node has memory pressure" 244 errCode = proto.StatusCode_STATUSCHECK_NODE_MEMORY_PRESSURE 245 case v1.TaintNodeDiskPressure: 246 messages[i] = "1 node has disk pressure" 247 errCode = proto.StatusCode_STATUSCHECK_NODE_DISK_PRESSURE 248 case v1.TaintNodePIDPressure: 249 messages[i] = "1 node has PID pressure" 250 errCode = proto.StatusCode_STATUSCHECK_NODE_PID_PRESSURE 251 case v1.TaintNodeNotReady: 252 messages[i] = "1 node is not ready" 253 if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE { 254 errCode = proto.StatusCode_STATUSCHECK_NODE_NOT_READY 255 } 256 case v1.TaintNodeUnreachable: 257 messages[i] = "1 node is unreachable" 258 if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE { 259 errCode = proto.StatusCode_STATUSCHECK_NODE_UNREACHABLE 260 } 261 case v1.TaintNodeUnschedulable: 262 messages[i] = "1 node is unschedulable" 263 if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE { 264 errCode = proto.StatusCode_STATUSCHECK_NODE_UNSCHEDULABLE 265 } 266 case v1.TaintNodeNetworkUnavailable: 267 messages[i] = "1 node's network not available" 268 if errCode == proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE { 269 errCode = proto.StatusCode_STATUSCHECK_NODE_NETWORK_UNAVAILABLE 270 } 271 } 272 } 273 return errCode, fmt.Errorf("%s: 0/%d nodes available: %s", reason, len(messages), strings.Join(messages, ", ")) 274 } 275 276 func processPodEvents(e corev1.EventInterface, pod v1.Pod, ps *podStatus) *podStatus { 277 updated := ps 278 if _, ok := unknownConditionsOrSuccess[ps.ae.ErrCode]; !ok { 279 return updated 280 } 281 log.Entry(context.TODO()).Debugf("Fetching events for pod %q", pod.Name) 282 // Get pod events. 283 scheme := runtime.NewScheme() 284 scheme.AddKnownTypes(v1.SchemeGroupVersion, &pod) 285 events, err := e.Search(scheme, &pod) 286 if err != nil { 287 log.Entry(context.TODO()).Debugf("Could not fetch events for resource %q due to %v", pod.Name, err) 288 return updated 289 } 290 // find the latest event. 291 var recentEvent *v1.Event 292 for _, e := range events.Items { 293 event := e.DeepCopy() 294 if recentEvent == nil || recentEvent.LastTimestamp.Before(&event.LastTimestamp) { 295 recentEvent = event 296 } 297 } 298 if recentEvent == nil || recentEvent.Type == v1.EventTypeNormal { 299 return updated 300 } 301 switch recentEvent.Reason { 302 case failedScheduling: 303 updated.updateAE(proto.StatusCode_STATUSCHECK_FAILED_SCHEDULING, recentEvent.Message) 304 case unhealthy: 305 updated.updateAE(proto.StatusCode_STATUSCHECK_UNHEALTHY, recentEvent.Message) 306 default: 307 // TODO: Add unique error codes for reasons 308 updated.updateAE( 309 proto.StatusCode_STATUSCHECK_UNKNOWN_EVENT, 310 fmt.Sprintf("%s: %s", recentEvent.Reason, recentEvent.Message), 311 ) 312 } 313 314 return updated 315 } 316 317 type podStatus struct { 318 name string 319 namespace string 320 phase string 321 logs []string 322 ae proto.ActionableErr 323 } 324 325 func (p *podStatus) isStable() bool { 326 return p.phase == success || (p.phase == running && p.ae.Message == "") 327 } 328 329 func (p *podStatus) withErrAndLogs(errCode proto.StatusCode, l []string, err error) *podStatus { 330 var msg string 331 if err != nil { 332 msg = err.Error() 333 } 334 p.updateAE(errCode, msg) 335 p.logs = l 336 return p 337 } 338 339 func (p *podStatus) updateAE(errCode proto.StatusCode, msg string) { 340 p.ae.ErrCode = errCode 341 p.ae.Message = msg 342 } 343 344 func (p *podStatus) String() string { 345 switch { 346 case p.isStable(): 347 return "" 348 default: 349 if p.ae.Message != "" { 350 return p.ae.Message 351 } 352 } 353 return fmt.Sprintf(actionableMessage, p.namespace, p.name) 354 } 355 356 func extractErrorMessageFromWaitingContainerStatus(po *v1.Pod, c v1.ContainerStatus) (proto.StatusCode, []string, error) { 357 // Extract meaning full error out of container statuses. 358 switch c.State.Waiting.Reason { 359 case podInitializing: 360 // container is waiting to run. This could be because one of the init containers is 361 // still not completed 362 return proto.StatusCode_STATUSCHECK_POD_INITIALIZING, nil, nil 363 case containerCreating: 364 return proto.StatusCode_STATUSCHECK_CONTAINER_CREATING, nil, fmt.Errorf("creating container %s", c.Name) 365 case crashLoopBackOff: 366 // TODO, in case of container restarting, return the original failure reason due to which container failed. 367 sc, l := getPodLogs(po, c.Name, proto.StatusCode_STATUSCHECK_CONTAINER_RESTARTING) 368 return sc, l, fmt.Errorf("container %s is backing off waiting to restart", c.Name) 369 case ImagePullErr, ImagePullBackOff, ErrImagePullBackOff: 370 return proto.StatusCode_STATUSCHECK_IMAGE_PULL_ERR, nil, fmt.Errorf("container %s is waiting to start: %s can't be pulled", c.Name, c.Image) 371 case runContainerError: 372 match := runContainerRe.FindStringSubmatch(c.State.Waiting.Message) 373 if len(match) != 0 { 374 return proto.StatusCode_STATUSCHECK_RUN_CONTAINER_ERR, nil, fmt.Errorf("container %s in error: %s", c.Name, trimSpace(match[3])) 375 } 376 } 377 log.Entry(context.TODO()).Debugf("Unknown waiting reason for container %q: %v", c.Name, c.State) 378 return proto.StatusCode_STATUSCHECK_CONTAINER_WAITING_UNKNOWN, nil, fmt.Errorf("container %s in error: %v", c.Name, c.State.Waiting) 379 } 380 381 func newPodStatus(n string, ns string, p string) *podStatus { 382 return &podStatus{ 383 name: n, 384 namespace: ns, 385 phase: p, 386 ae: proto.ActionableErr{ 387 ErrCode: proto.StatusCode_STATUSCHECK_SUCCESS, 388 }, 389 } 390 } 391 392 func trimSpace(msg string) string { 393 return strings.Trim(msg, " ") 394 } 395 396 func getPodLogs(po *v1.Pod, c string, sc proto.StatusCode) (proto.StatusCode, []string) { 397 log.Entry(context.TODO()).Debugf("Fetching logs for container %s/%s", po.Name, c) 398 logCommand := []string{"kubectl", "logs", po.Name, "-n", po.Namespace, "-c", c} 399 logs, err := runCli(logCommand[0], logCommand[1:]) 400 if err != nil { 401 return sc, []string{fmt.Sprintf("Error retrieving logs for pod %s: %s.\nTry `%s`", po.Name, err, strings.Join(logCommand, " "))} 402 } 403 if strings.Contains(string(logs), execFmtError) { 404 sc = proto.StatusCode_STATUSCHECK_CONTAINER_EXEC_ERROR 405 } 406 output := strings.Split(string(logs), "\n") 407 // remove spurious empty lines (empty string or from trailing newline) 408 lines := make([]string, 0, len(output)) 409 for _, s := range output { 410 if s == "" { 411 continue 412 } 413 lines = append(lines, fmt.Sprintf("[%s %s] %s", po.Name, c, s)) 414 } 415 return sc, lines 416 } 417 418 func executeCLI(cmdName string, args []string) ([]byte, error) { 419 cmd := exec.Command(cmdName, args...) 420 return cmd.CombinedOutput() 421 } 422 423 func isPodOwnedBy(po v1.Pod, controller metav1.Object) bool { 424 if controller == nil { 425 return true 426 } 427 return metav1.IsControlledBy(&po, controller) 428 }