github.com/kubeflow/training-operator@v1.7.0/pkg/util/k8sutil/k8sutil.go (about)

     1  // Copyright 2018 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package k8sutil
    16  
    17  import (
    18  	"net"
    19  	"os"
    20  
    21  	apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    22  	log "github.com/sirupsen/logrus"
    23  	v1 "k8s.io/api/core/v1"
    24  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/client-go/kubernetes"
    27  	_ "k8s.io/client-go/plugin/pkg/client/auth/gcp" // for gcp auth
    28  	"k8s.io/client-go/rest"
    29  	"k8s.io/client-go/tools/clientcmd"
    30  )
    31  
// RecommendedConfigPathEnvVar is the name of the environment variable that
// GetClusterConfig reads to obtain the path of a kubeconfig file.
const RecommendedConfigPathEnvVar = "KUBECONFIG"
    34  
    35  // MustNewKubeClient returns new kubernetes client for cluster configuration
    36  func MustNewKubeClient() kubernetes.Interface {
    37  	cfg, err := GetClusterConfig()
    38  	if err != nil {
    39  		log.Fatal(err)
    40  	}
    41  	return kubernetes.NewForConfigOrDie(cfg)
    42  }
    43  
    44  // GetClusterConfig obtain the config from the Kube configuration used by kubeconfig, or from k8s cluster.
    45  func GetClusterConfig() (*rest.Config, error) {
    46  	if len(os.Getenv(RecommendedConfigPathEnvVar)) > 0 {
    47  		// use the current context in kubeconfig
    48  		// This is very useful for running locally.
    49  		return clientcmd.BuildConfigFromFlags("", os.Getenv(RecommendedConfigPathEnvVar))
    50  	}
    51  
    52  	// Work around https://github.com/kubernetes/kubernetes/issues/40973
    53  	// See https://github.com/coreos/etcd-operator/issues/731#issuecomment-283804819
    54  	if len(os.Getenv("KUBERNETES_SERVICE_HOST")) == 0 {
    55  		addrs, err := net.LookupHost("kubernetes.default.svc")
    56  		if err != nil {
    57  			panic(err)
    58  		}
    59  		if err := os.Setenv("KUBERNETES_SERVICE_HOST", addrs[0]); err != nil {
    60  			return nil, err
    61  		}
    62  	}
    63  	if len(os.Getenv("KUBERNETES_SERVICE_PORT")) == 0 {
    64  		if err := os.Setenv("KUBERNETES_SERVICE_PORT", "443"); err != nil {
    65  			panic(err)
    66  		}
    67  	}
    68  	return rest.InClusterConfig()
    69  }
    70  
// IsKubernetesResourceAlreadyExistError reports whether err is an
// "already exists" API error, as determined by apierrors.IsAlreadyExists.
func IsKubernetesResourceAlreadyExistError(err error) bool {
	return apierrors.IsAlreadyExists(err)
}
    75  
// IsKubernetesResourceNotFoundError reports whether err is a "not found" API
// error, as determined by apierrors.IsNotFound.
func IsKubernetesResourceNotFoundError(err error) bool {
	return apierrors.IsNotFound(err)
}
    80  
    81  // TODO(jlewi): CascadeDeletOptions are part of garbage collection policy.
    82  // CascadeDeleteOptions deletes the workload after the grace period
    83  // Do we want to use this? See
    84  // https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/
    85  func CascadeDeleteOptions(gracePeriodSeconds int64) *metav1.DeleteOptions {
    86  	return &metav1.DeleteOptions{
    87  		GracePeriodSeconds: func(t int64) *int64 { return &t }(gracePeriodSeconds),
    88  		PropagationPolicy: func() *metav1.DeletionPropagation {
    89  			foreground := metav1.DeletePropagationForeground
    90  			return &foreground
    91  		}(),
    92  	}
    93  }
    94  
    95  // FilterActivePods returns pods that have not terminated.
    96  func FilterActivePods(pods []*v1.Pod) []*v1.Pod {
    97  	var result []*v1.Pod
    98  	for _, p := range pods {
    99  		if IsPodActive(p) {
   100  			result = append(result, p)
   101  		} else {
   102  			log.Infof("Ignoring inactive pod %v/%v in state %v, deletion time %v",
   103  				p.Namespace, p.Name, p.Status.Phase, p.DeletionTimestamp)
   104  		}
   105  	}
   106  	return result
   107  }
   108  
   109  func IsPodActive(p *v1.Pod) bool {
   110  	return v1.PodSucceeded != p.Status.Phase &&
   111  		v1.PodFailed != p.Status.Phase &&
   112  		p.DeletionTimestamp == nil
   113  }
   114  
   115  // filterPodCount returns pods based on their phase.
   116  func FilterPodCount(pods []*v1.Pod, phase v1.PodPhase) int32 {
   117  	var result int32
   118  	for i := range pods {
   119  		if phase == pods[i].Status.Phase {
   120  			result++
   121  		}
   122  	}
   123  	return result
   124  }
   125  
   126  func GetTotalReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) int32 {
   127  	jobReplicas := int32(0)
   128  	for _, r := range replicas {
   129  		if r.Replicas != nil {
   130  			jobReplicas += *r.Replicas
   131  		} else {
   132  			// If unspecified, defaults to 1.
   133  			jobReplicas += 1
   134  		}
   135  	}
   136  	return jobReplicas
   137  }
   138  
   139  func GetTotalFailedReplicas(replicas map[apiv1.ReplicaType]*apiv1.ReplicaStatus) int32 {
   140  	totalFailedReplicas := int32(0)
   141  	for _, status := range replicas {
   142  		totalFailedReplicas += status.Failed
   143  	}
   144  	return totalFailedReplicas
   145  }