github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/taint_cnr.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package helper 18 19 import ( 20 "context" 21 "time" 22 23 corev1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/api/equality" 25 "k8s.io/apimachinery/pkg/api/errors" 26 "k8s.io/apimachinery/pkg/util/wait" 27 corelisters "k8s.io/client-go/listers/core/v1" 28 "k8s.io/klog/v2" 29 "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler" 30 31 apis "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" 32 listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1" 33 "github.com/kubewharf/katalyst-core/pkg/client/control" 34 "github.com/kubewharf/katalyst-core/pkg/metrics" 35 "github.com/kubewharf/katalyst-core/pkg/util" 36 "github.com/kubewharf/katalyst-core/pkg/util/general" 37 ) 38 39 const ( 40 metricsNameUntaintedCNRCount = "untainted_cnr_count" 41 metricsNameTaintedCNRCount = "tainted_cnr_count" 42 ) 43 44 const TaintNameNoScheduler = "TaintNameNoScheduler" 45 46 var TaintNoScheduler = &apis.Taint{ 47 Key: corev1.TaintNodeUnschedulable, 48 Effect: apis.TaintEffectNoScheduleForReclaimedTasks, 49 } 50 51 var allTaints = []*apis.Taint{ 52 TaintNoScheduler, 53 } 54 55 // CNRTaintItem records the detailed item to perform cnr-taints 56 type CNRTaintItem struct { 57 Taints map[string]*apis.Taint 58 } 59 60 type CNRTaintHelper struct { 61 ctx context.Context 62 emitter metrics.MetricEmitter 63 cnrControl control.CNRControl 64 65 checker *HealthzHelper 66 queue *scheduler.RateLimitedTimedQueue 67 68 nodeLister corelisters.NodeLister 69 cnrLister listers.CustomNodeResourceLister 70 } 71 72 // NewTaintHelper todo add logic here 73 func NewTaintHelper(ctx context.Context, emitter metrics.MetricEmitter, cnrControl control.CNRControl, 74 nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister, 75 queue *scheduler.RateLimitedTimedQueue, checker *HealthzHelper, 76 ) *CNRTaintHelper { 77 return &CNRTaintHelper{ 78 ctx: ctx, 79 emitter: emitter, 80 cnrControl: cnrControl, 81 82 checker: checker, 83 queue: queue, 84 85 nodeLister: nodeLister, 86 cnrLister: cnrLister, 87 } 88 } 89 90 func (t *CNRTaintHelper) Run() { 91 go wait.Until(t.doTaint, scheduler.NodeEvictionPeriod, t.ctx.Done()) 92 } 93 94 // doTaint is used to pop nodes from to-be-tainted queue, 95 // and then trigger the taint actions 96 func (t *CNRTaintHelper) doTaint() { 97 t.queue.Try(func(value scheduler.TimedValue) (bool, time.Duration) { 98 node := value.Value 99 100 cnr, err := t.cnrLister.Get(value.Value) 101 if errors.IsNotFound(err) { 102 klog.Warningf("cnr %v no longer present in cnrLister", value.Value) 103 return true, 0 104 } else if err != nil { 105 klog.Errorf("cannot find cnr for node %v err %v", node, err) 106 // retry in 50 millisecond 107 return false, 50 * time.Millisecond 108 } 109 110 // second confirm that we should taint cnr 111 item := value.UID.(*CNRTaintItem) 112 needTaint := t.checker.CheckAllAgentReady(node) 113 if needTaint && len(item.Taints) != 0 { 114 if err := t.taintCNR(cnr, item); err != nil { 115 klog.Warningf("failed to taint for cnr %v: %v", value.Value, err) 116 return false, 0 117 } 118 } 119 120 return true, 0 121 }) 122 } 123 124 func (t *CNRTaintHelper) taintCNR(cnr *apis.CustomNodeResource, item *CNRTaintItem) error { 125 var err error 126 var newCNR *apis.CustomNodeResource 127 for _, taint := range item.Taints { 128 newCNR, _, err = util.AddOrUpdateCNRTaint(cnr, taint) 129 if err != nil { 130 return err 131 } 132 } 133 134 if equality.Semantic.DeepEqual(cnr, newCNR) { 135 general.Infof("taint already exits, not need to update") 136 return nil 137 } 138 139 _, err = t.cnrControl.PatchCNRSpecAndMetadata(t.ctx, cnr.Name, cnr, newCNR) 140 if err != nil { 141 _ = t.emitter.StoreInt64(metricsNameTaintedCNRCount, 1, metrics.MetricTypeNameCount, 142 []metrics.MetricTag{ 143 {Key: "status", Val: "failed"}, 144 {Key: "name", Val: cnr.Name}, 145 }...) 146 return err 147 } 148 _ = t.emitter.StoreInt64(metricsNameTaintedCNRCount, 1, metrics.MetricTypeNameCount, 149 []metrics.MetricTag{ 150 {Key: "status", Val: "success"}, 151 {Key: "name", Val: cnr.Name}, 152 }...) 153 154 return nil 155 } 156 157 // TryUNTaintCNR is used to delete taint info from CNR 158 func (t *CNRTaintHelper) TryUNTaintCNR(name string) error { 159 cnr, err := t.cnrLister.Get(name) 160 if errors.IsNotFound(err) { 161 klog.Warningf("cnr %v no longer present in cnrLister", name) 162 return nil 163 } else if err != nil { 164 return err 165 } 166 167 var newCNR *apis.CustomNodeResource 168 for _, taint := range allTaints { 169 newCNR, _, err = util.RemoveCNRTaint(cnr, taint) 170 if err != nil { 171 return err 172 } 173 } 174 175 if equality.Semantic.DeepEqual(cnr, newCNR) { 176 klog.V(5).InfoS("taint already disappears, not need to update", "cnr", cnr.Name) 177 return nil 178 } 179 180 _, err = t.cnrControl.PatchCNRSpecAndMetadata(t.ctx, cnr.Name, cnr, newCNR) 181 if err != nil { 182 _ = t.emitter.StoreInt64(metricsNameUntaintedCNRCount, 1, metrics.MetricTypeNameCount, 183 []metrics.MetricTag{ 184 {Key: "status", Val: "failed"}, 185 {Key: "name", Val: cnr.Name}, 186 }...) 187 return err 188 } 189 _ = t.emitter.StoreInt64(metricsNameUntaintedCNRCount, 1, metrics.MetricTypeNameCount, 190 []metrics.MetricTag{ 191 {Key: "status", Val: "success"}, 192 {Key: "name", Val: cnr.Name}, 193 }...) 194 195 return nil 196 }