github.com/m-lab/locate@v0.17.6/cmd/heartbeat/health/kubernetes-client.go (about)

     1  package health
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"net/url"
     7  	"path"
     8  	"strconv"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/m-lab/go/rtx"
    13  	"github.com/m-lab/locate/metrics"
    14  	v1 "k8s.io/api/core/v1"
    15  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    16  	"k8s.io/client-go/kubernetes"
    17  	"k8s.io/client-go/tools/clientcmd"
    18  	"k8s.io/client-go/tools/clientcmd/api"
    19  )
    20  
// errKubernetesAPI is the message prefix written to the log whenever a request
// to the Kubernetes API server returns an error. Despite the "err" prefix it
// is a plain string, not an error value.
var errKubernetesAPI = "error making request to Kubernetes API server"
    22  
// KubernetesClient manages requests to the Kubernetes API server.
//
// It holds the identifiers of the Pod and Node whose state is queried, along
// with the clientset used to issue those queries.
type KubernetesClient struct {
	// pod is the name of the Pod looked up in namespace.
	pod       string
	// node is the name of the Node looked up via the API.
	node      string
	// namespace is the namespace in which pod is looked up.
	namespace string
	// clientset performs the actual API calls; it is an interface type, so
	// any kubernetes.Interface implementation can be stored here.
	clientset kubernetes.Interface
}
    30  
    31  // MustNewKubernetesClient creates a new KubenernetesClient instance.
    32  // If the client cannot be instantiated, the function will exit.
    33  func MustNewKubernetesClient(url *url.URL, pod, node, namespace, auth string) *KubernetesClient {
    34  	defConfig := getDefaultClientConfig(url, auth)
    35  	restConfig, err := defConfig.ClientConfig()
    36  	rtx.Must(err, "failed to create kubernetes config")
    37  
    38  	clientset, err := kubernetes.NewForConfig(restConfig)
    39  	rtx.Must(err, "failed to create kubernetes clientset")
    40  
    41  	client := &KubernetesClient{
    42  		pod:       pod,
    43  		node:      node,
    44  		namespace: namespace,
    45  		clientset: clientset,
    46  	}
    47  	return client
    48  }
    49  
    50  func getDefaultClientConfig(url *url.URL, auth string) clientcmd.ClientConfig {
    51  	// This is a low-level structure normally created from parsing a kubeconfig
    52  	// file.  Since we know all values we can create the client object directly.
    53  	//
    54  	// The cluster and user names serve only to define a context that
    55  	// associates login credentials with a specific cluster.
    56  	clusterClient := api.Config{
    57  		Clusters: map[string]*api.Cluster{
    58  			// Define the cluster address and CA Certificate.
    59  			"cluster": {
    60  				Server:                url.String(),
    61  				InsecureSkipTLSVerify: false, // Require a valid CA Certificate.
    62  				CertificateAuthority:  path.Join(auth, "ca.crt"),
    63  			},
    64  		},
    65  		AuthInfos: map[string]*api.AuthInfo{
    66  			// Define the user credentials for access to the API.
    67  			"user": {
    68  				TokenFile: path.Join(auth, "token"),
    69  			},
    70  		},
    71  		Contexts: map[string]*api.Context{
    72  			// Define a context that refers to the above cluster and user.
    73  			"cluster-user": {
    74  				Cluster:  "cluster",
    75  				AuthInfo: "user",
    76  			},
    77  		},
    78  		// Use the above context.
    79  		CurrentContext: "cluster-user",
    80  	}
    81  
    82  	defConfig := clientcmd.NewDefaultClientConfig(
    83  		clusterClient,
    84  		&clientcmd.ConfigOverrides{
    85  			ClusterInfo: api.Cluster{Server: ""},
    86  		},
    87  	)
    88  
    89  	return defConfig
    90  }
    91  
    92  // isHealthy returns true if it can determine the following conditions are true:
    93  //   - The Pod's status is "Running"
    94  //   - The Node's Ready condition is "True"
    95  //   - The Node does not have a "lame-duck" taint
    96  //
    97  // OR if it cannot contact the API Server to make a determination.
    98  func (c *KubernetesClient) isHealthy(ctx context.Context) bool {
    99  	start := time.Now()
   100  	isHealthy := c.isPodRunning(ctx) && c.isNodeReady(ctx)
   101  	metrics.KubernetesRequestTimeHistogram.WithLabelValues(strconv.FormatBool(isHealthy)).Observe(time.Since(start).Seconds())
   102  	return isHealthy
   103  }
   104  
   105  func (c *KubernetesClient) isPodRunning(ctx context.Context) bool {
   106  	pod, err := c.clientset.CoreV1().Pods(c.namespace).Get(ctx, c.pod, metav1.GetOptions{})
   107  	if err != nil {
   108  		log.Printf("%s: %v", errKubernetesAPI, err)
   109  		metrics.KubernetesRequestsTotal.WithLabelValues("pod", extractError(err)).Inc()
   110  		return true
   111  	}
   112  
   113  	metrics.KubernetesRequestsTotal.WithLabelValues("pod", "OK").Inc()
   114  	return pod.Status.Phase == "Running"
   115  }
   116  
   117  // isNodeReady returns true if it can determine the following conditions are true:
   118  //   - The Node's Ready condition is "True"
   119  //   - The Node does not have a "lame-duck" taint
   120  //
   121  // OR if it cannot contact the API Server to make a determination.
   122  func (c *KubernetesClient) isNodeReady(ctx context.Context) bool {
   123  	node, err := c.clientset.CoreV1().Nodes().Get(ctx, c.node, metav1.GetOptions{})
   124  	if err != nil {
   125  		log.Printf("%s: %v", errKubernetesAPI, err)
   126  		metrics.KubernetesRequestsTotal.WithLabelValues("node", extractError(err)).Inc()
   127  		return true
   128  	}
   129  
   130  	metrics.KubernetesRequestsTotal.WithLabelValues("node", "OK").Inc()
   131  	for _, condition := range node.Status.Conditions {
   132  		if condition.Type == "Ready" && condition.Status == "True" {
   133  			return !isInMaintenance(node)
   134  		}
   135  	}
   136  
   137  	return false
   138  }
   139  
   140  func isInMaintenance(node *v1.Node) bool {
   141  	for _, taint := range node.Spec.Taints {
   142  		if taint.Key == "lame-duck" {
   143  			return true
   144  		}
   145  	}
   146  
   147  	return false
   148  }
   149  
   150  // extractError extracts the base error string from the error returned by the
   151  // the Kubernetes API.
   152  func extractError(err error) string {
   153  	parts := strings.Split(err.Error(), ": ")
   154  	return parts[len(parts)-1]
   155  }