volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/util/nodelock/nodelock.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package nodelock
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"path/filepath"
    24  	"time"
    25  
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/client-go/kubernetes"
    28  	"k8s.io/client-go/rest"
    29  	"k8s.io/client-go/tools/clientcmd"
    30  	"k8s.io/klog/v2"
    31  )
    32  
// MaxLockRetry is the maximum number of additional attempts made after a
// failed node update when setting or releasing a node lock.
const MaxLockRetry = 5

// kubeClient is the package-level Kubernetes client. It is assigned by
// NewClient and UseClient, returned by GetClient, and used by the lock
// functions in this package for every node Get/Update call.
var kubeClient kubernetes.Interface
    36  
    37  func GetClient() kubernetes.Interface {
    38  	return kubeClient
    39  }
    40  
    41  // NewClient connects to an API server
    42  func NewClient() (kubernetes.Interface, error) {
    43  	kubeConfig := os.Getenv("KUBECONFIG")
    44  	if kubeConfig == "" {
    45  		kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config")
    46  	}
    47  	config, err := rest.InClusterConfig()
    48  	if err != nil {
    49  		config, err = clientcmd.BuildConfigFromFlags("", kubeConfig)
    50  		if err != nil {
    51  			return nil, err
    52  		}
    53  	}
    54  	client, err := kubernetes.NewForConfig(config)
    55  	kubeClient = client
    56  	return client, err
    57  }
    58  
    59  // UseClient uses an existing client
    60  func UseClient(client kubernetes.Interface) error {
    61  	kubeClient = client
    62  	return nil
    63  }
    64  
    65  func setNodeLock(nodeName string, lockName string) error {
    66  	ctx := context.Background()
    67  	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
    68  	if err != nil {
    69  		klog.Errorln("get node failed", err.Error())
    70  		return err
    71  	}
    72  	if _, ok := node.ObjectMeta.Annotations[lockName]; ok {
    73  		klog.V(3).Infof("node %s is locked", nodeName)
    74  		return fmt.Errorf("node %s is locked", nodeName)
    75  	}
    76  	newNode := node.DeepCopy()
    77  	newNode.ObjectMeta.Annotations[lockName] = time.Now().Format(time.RFC3339)
    78  	_, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{})
    79  	for i := 0; i < MaxLockRetry && err != nil; i++ {
    80  		klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i)
    81  		time.Sleep(100 * time.Millisecond)
    82  		node, err = kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
    83  		if err != nil {
    84  			klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName)
    85  			continue
    86  		}
    87  		newNode := node.DeepCopy()
    88  		newNode.ObjectMeta.Annotations[lockName] = time.Now().Format(time.RFC3339)
    89  		_, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{})
    90  	}
    91  	if err != nil {
    92  		return fmt.Errorf("setNodeLock exceeds retry count %d", MaxLockRetry)
    93  	}
    94  	klog.InfoS("Node lock set", "node", nodeName)
    95  	return nil
    96  }
    97  
    98  // ReleaseNodeLock release a certain device lock on a certain node
    99  func ReleaseNodeLock(nodeName string, lockName string) error {
   100  	ctx := context.Background()
   101  	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
   102  	if err != nil {
   103  		return err
   104  	}
   105  	if _, ok := node.ObjectMeta.Annotations[lockName]; !ok {
   106  		klog.V(3).InfoS("Node lock not set", "node", nodeName)
   107  		return nil
   108  	}
   109  	newNode := node.DeepCopy()
   110  	delete(newNode.ObjectMeta.Annotations, lockName)
   111  	_, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{})
   112  	for i := 0; i < MaxLockRetry && err != nil; i++ {
   113  		klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i)
   114  		time.Sleep(100 * time.Millisecond)
   115  		node, err = kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
   116  		if err != nil {
   117  			klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName)
   118  			continue
   119  		}
   120  		newNode := node.DeepCopy()
   121  		delete(newNode.ObjectMeta.Annotations, lockName)
   122  		_, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{})
   123  	}
   124  	if err != nil {
   125  		return fmt.Errorf("releaseNodeLock exceeds retry count %d", MaxLockRetry)
   126  	}
   127  	klog.InfoS("Node lock released", "node", nodeName)
   128  	return nil
   129  }
   130  
   131  // LockNode try lock device 'lockName' on node 'nodeName'
   132  func LockNode(nodeName string, lockName string) error {
   133  	ctx := context.Background()
   134  	node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
   135  	if err != nil {
   136  		return err
   137  	}
   138  	if _, ok := node.ObjectMeta.Annotations[lockName]; !ok {
   139  		return setNodeLock(nodeName, lockName)
   140  	}
   141  	lockTime, err := time.Parse(time.RFC3339, node.ObjectMeta.Annotations[lockName])
   142  	if err != nil {
   143  		return err
   144  	}
   145  	if time.Since(lockTime) > time.Minute*5 {
   146  		klog.V(3).InfoS("Node lock expired", "node", nodeName, "lockTime", lockTime)
   147  		err = ReleaseNodeLock(nodeName, lockName)
   148  		if err != nil {
   149  			klog.ErrorS(err, "Failed to release node lock", "node", nodeName)
   150  			return err
   151  		}
   152  		return setNodeLock(nodeName, lockName)
   153  	}
   154  	return fmt.Errorf("node %s has been locked within 5 minutes", nodeName)
   155  }