github.com/kubeflow/training-operator@v1.7.0/pkg/util/k8sutil/k8sutil.go (about) 1 // Copyright 2018 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package k8sutil 16 17 import ( 18 "net" 19 "os" 20 21 apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 22 log "github.com/sirupsen/logrus" 23 v1 "k8s.io/api/core/v1" 24 apierrors "k8s.io/apimachinery/pkg/api/errors" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/client-go/kubernetes" 27 _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" // for gcp auth 28 "k8s.io/client-go/rest" 29 "k8s.io/client-go/tools/clientcmd" 30 ) 31 32 // RecommendedConfigPathEnvVar is a environment variable for path configuration 33 const RecommendedConfigPathEnvVar = "KUBECONFIG" 34 35 // MustNewKubeClient returns new kubernetes client for cluster configuration 36 func MustNewKubeClient() kubernetes.Interface { 37 cfg, err := GetClusterConfig() 38 if err != nil { 39 log.Fatal(err) 40 } 41 return kubernetes.NewForConfigOrDie(cfg) 42 } 43 44 // GetClusterConfig obtain the config from the Kube configuration used by kubeconfig, or from k8s cluster. 45 func GetClusterConfig() (*rest.Config, error) { 46 if len(os.Getenv(RecommendedConfigPathEnvVar)) > 0 { 47 // use the current context in kubeconfig 48 // This is very useful for running locally. 49 return clientcmd.BuildConfigFromFlags("", os.Getenv(RecommendedConfigPathEnvVar)) 50 } 51 52 // Work around https://github.com/kubernetes/kubernetes/issues/40973 53 // See https://github.com/coreos/etcd-operator/issues/731#issuecomment-283804819 54 if len(os.Getenv("KUBERNETES_SERVICE_HOST")) == 0 { 55 addrs, err := net.LookupHost("kubernetes.default.svc") 56 if err != nil { 57 panic(err) 58 } 59 if err := os.Setenv("KUBERNETES_SERVICE_HOST", addrs[0]); err != nil { 60 return nil, err 61 } 62 } 63 if len(os.Getenv("KUBERNETES_SERVICE_PORT")) == 0 { 64 if err := os.Setenv("KUBERNETES_SERVICE_PORT", "443"); err != nil { 65 panic(err) 66 } 67 } 68 return rest.InClusterConfig() 69 } 70 71 // IsKubernetesResourceAlreadyExistError throws error when kubernetes resources already exist. 72 func IsKubernetesResourceAlreadyExistError(err error) bool { 73 return apierrors.IsAlreadyExists(err) 74 } 75 76 // IsKubernetesResourceNotFoundError throws error when there is no kubernetes resource found. 77 func IsKubernetesResourceNotFoundError(err error) bool { 78 return apierrors.IsNotFound(err) 79 } 80 81 // TODO(jlewi): CascadeDeletOptions are part of garbage collection policy. 82 // CascadeDeleteOptions deletes the workload after the grace period 83 // Do we want to use this? See 84 // https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/ 85 func CascadeDeleteOptions(gracePeriodSeconds int64) *metav1.DeleteOptions { 86 return &metav1.DeleteOptions{ 87 GracePeriodSeconds: func(t int64) *int64 { return &t }(gracePeriodSeconds), 88 PropagationPolicy: func() *metav1.DeletionPropagation { 89 foreground := metav1.DeletePropagationForeground 90 return &foreground 91 }(), 92 } 93 } 94 95 // FilterActivePods returns pods that have not terminated. 96 func FilterActivePods(pods []*v1.Pod) []*v1.Pod { 97 var result []*v1.Pod 98 for _, p := range pods { 99 if IsPodActive(p) { 100 result = append(result, p) 101 } else { 102 log.Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v", 103 p.Namespace, p.Name, p.Status.Phase, p.DeletionTimestamp) 104 } 105 } 106 return result 107 } 108 109 func IsPodActive(p *v1.Pod) bool { 110 return v1.PodSucceeded != p.Status.Phase && 111 v1.PodFailed != p.Status.Phase && 112 p.DeletionTimestamp == nil 113 } 114 115 // filterPodCount returns pods based on their phase. 116 func FilterPodCount(pods []*v1.Pod, phase v1.PodPhase) int32 { 117 var result int32 118 for i := range pods { 119 if phase == pods[i].Status.Phase { 120 result++ 121 } 122 } 123 return result 124 } 125 126 func GetTotalReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) int32 { 127 jobReplicas := int32(0) 128 for _, r := range replicas { 129 if r.Replicas != nil { 130 jobReplicas += *r.Replicas 131 } else { 132 // If unspecified, defaults to 1. 133 jobReplicas += 1 134 } 135 } 136 return jobReplicas 137 } 138 139 func GetTotalFailedReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaStatus) int32 { 140 totalFailedReplicas := int32(0) 141 for _, status := range replicas { 142 totalFailedReplicas += status.Failed 143 } 144 return totalFailedReplicas 145 }