volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/util/nodelock/nodelock.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package nodelock 18 19 import ( 20 "context" 21 "fmt" 22 "os" 23 "path/filepath" 24 "time" 25 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/client-go/kubernetes" 28 "k8s.io/client-go/rest" 29 "k8s.io/client-go/tools/clientcmd" 30 "k8s.io/klog/v2" 31 ) 32 33 const MaxLockRetry = 5 34 35 var kubeClient kubernetes.Interface 36 37 func GetClient() kubernetes.Interface { 38 return kubeClient 39 } 40 41 // NewClient connects to an API server 42 func NewClient() (kubernetes.Interface, error) { 43 kubeConfig := os.Getenv("KUBECONFIG") 44 if kubeConfig == "" { 45 kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") 46 } 47 config, err := rest.InClusterConfig() 48 if err != nil { 49 config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) 50 if err != nil { 51 return nil, err 52 } 53 } 54 client, err := kubernetes.NewForConfig(config) 55 kubeClient = client 56 return client, err 57 } 58 59 // UseClient uses an existing client 60 func UseClient(client kubernetes.Interface) error { 61 kubeClient = client 62 return nil 63 } 64 65 func setNodeLock(nodeName string, lockName string) error { 66 ctx := context.Background() 67 node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 68 if err != nil { 69 klog.Errorln("get node failed", err.Error()) 70 return err 71 } 72 if _, ok := node.ObjectMeta.Annotations[lockName]; ok { 73 klog.V(3).Infof("node %s is locked", nodeName) 74 return fmt.Errorf("node %s is locked", nodeName) 75 } 76 newNode := node.DeepCopy() 77 newNode.ObjectMeta.Annotations[lockName] = time.Now().Format(time.RFC3339) 78 _, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) 79 for i := 0; i < MaxLockRetry && err != nil; i++ { 80 klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i) 81 time.Sleep(100 * time.Millisecond) 82 node, err = kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 83 if err != nil { 84 klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName) 85 continue 86 } 87 newNode := node.DeepCopy() 88 newNode.ObjectMeta.Annotations[lockName] = time.Now().Format(time.RFC3339) 89 _, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) 90 } 91 if err != nil { 92 return fmt.Errorf("setNodeLock exceeds retry count %d", MaxLockRetry) 93 } 94 klog.InfoS("Node lock set", "node", nodeName) 95 return nil 96 } 97 98 // ReleaseNodeLock release a certain device lock on a certain node 99 func ReleaseNodeLock(nodeName string, lockName string) error { 100 ctx := context.Background() 101 node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 102 if err != nil { 103 return err 104 } 105 if _, ok := node.ObjectMeta.Annotations[lockName]; !ok { 106 klog.V(3).InfoS("Node lock not set", "node", nodeName) 107 return nil 108 } 109 newNode := node.DeepCopy() 110 delete(newNode.ObjectMeta.Annotations, lockName) 111 _, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) 112 for i := 0; i < MaxLockRetry && err != nil; i++ { 113 klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i) 114 time.Sleep(100 * time.Millisecond) 115 node, err = kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 116 if err != nil { 117 klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName) 118 continue 119 } 120 newNode := node.DeepCopy() 121 delete(newNode.ObjectMeta.Annotations, lockName) 122 _, err = kubeClient.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) 123 } 124 if err != nil { 125 return fmt.Errorf("releaseNodeLock exceeds retry count %d", MaxLockRetry) 126 } 127 klog.InfoS("Node lock released", "node", nodeName) 128 return nil 129 } 130 131 // LockNode try lock device 'lockName' on node 'nodeName' 132 func LockNode(nodeName string, lockName string) error { 133 ctx := context.Background() 134 node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 135 if err != nil { 136 return err 137 } 138 if _, ok := node.ObjectMeta.Annotations[lockName]; !ok { 139 return setNodeLock(nodeName, lockName) 140 } 141 lockTime, err := time.Parse(time.RFC3339, node.ObjectMeta.Annotations[lockName]) 142 if err != nil { 143 return err 144 } 145 if time.Since(lockTime) > time.Minute*5 { 146 klog.V(3).InfoS("Node lock expired", "node", nodeName, "lockTime", lockTime) 147 err = ReleaseNodeLock(nodeName, lockName) 148 if err != nil { 149 klog.ErrorS(err, "Failed to release node lock", "node", nodeName) 150 return err 151 } 152 return setNodeLock(nodeName, lockName) 153 } 154 return fmt.Errorf("node %s has been locked within 5 minutes", nodeName) 155 }