github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/evict.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package helper 18 19 import ( 20 "context" 21 "time" 22 23 corev1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/api/errors" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 utilerrors "k8s.io/apimachinery/pkg/util/errors" 27 "k8s.io/apimachinery/pkg/util/sets" 28 "k8s.io/apimachinery/pkg/util/wait" 29 corelisters "k8s.io/client-go/listers/core/v1" 30 "k8s.io/client-go/tools/cache" 31 "k8s.io/klog/v2" 32 "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler" 33 34 listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1" 35 "github.com/kubewharf/katalyst-core/pkg/client/control" 36 "github.com/kubewharf/katalyst-core/pkg/metrics" 37 ) 38 39 const metricsNameEvictedReclaimedPodCount = "evicted_reclaimed_pod_count" 40 41 // EvictItem records the detailed item to perform pod eviction 42 type EvictItem struct { 43 // PodKeys maps from agent-name to pod-keys (that should be evicted because of the Agents) 44 PodKeys map[string][]string 45 } 46 47 type EvictHelper struct { 48 ctx context.Context 49 emitter metrics.MetricEmitter 50 podControl control.PodEjector 51 52 checker *HealthzHelper 53 queue *scheduler.RateLimitedTimedQueue 54 55 podLister corelisters.PodLister 56 nodeLister corelisters.NodeLister 57 cnrLister listers.CustomNodeResourceLister 58 } 59 60 // NewEvictHelper todo add logic here 61 func NewEvictHelper(ctx context.Context, emitter metrics.MetricEmitter, 62 podControl control.PodEjector, nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister, 63 queue *scheduler.RateLimitedTimedQueue, checker *HealthzHelper, 64 ) *EvictHelper { 65 return &EvictHelper{ 66 ctx: ctx, 67 emitter: emitter, 68 podControl: podControl, 69 70 queue: queue, 71 checker: checker, 72 73 nodeLister: nodeLister, 74 cnrLister: cnrLister, 75 } 76 } 77 78 func (e *EvictHelper) Run() { 79 go wait.Until(e.doEviction, scheduler.NodeEvictionPeriod, e.ctx.Done()) 80 } 81 82 // doEviction is used to pop nodes from to-be-evicted queue, 83 // and then trigger the taint actions 84 func (e *EvictHelper) doEviction() { 85 e.queue.Try(func(value scheduler.TimedValue) (bool, time.Duration) { 86 node, err := e.nodeLister.Get(value.Value) 87 if errors.IsNotFound(err) { 88 klog.Warningf("Node %v no longer present in nodeLister", value.Value) 89 return true, 0 90 } else if err != nil { 91 klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) 92 // retry in 50 millisecond 93 return false, 50 * time.Millisecond 94 } 95 96 // second confirm that we should evict reclaimed pods 97 keys := sets.NewString() 98 item := value.UID.(*EvictItem) 99 for agent, names := range item.PodKeys { 100 if !e.checker.CheckAgentReady(node.Name, agent) { 101 keys.Insert(names...) 102 } 103 } 104 105 if err := e.evictPods(node, keys.List()); err != nil { 106 klog.Warningf("failed to evict pods for cnr %v: %v", value.Value, err) 107 return true, 5 * time.Second 108 } 109 return true, 0 110 }) 111 } 112 113 // evictPods must filter out those pods that should be managed 114 // todo evict pods in with concurrency if necessary 115 func (e *EvictHelper) evictPods(node *corev1.Node, keys []string) error { 116 var errList []error 117 for _, key := range keys { 118 namespace, name, err := cache.SplitMetaNamespaceKey(key) 119 if err != nil { 120 klog.Errorf("failed to split namespace and name from key %s", key) 121 continue 122 } 123 124 delErr := e.podControl.DeletePod(e.ctx, namespace, name, metav1.DeleteOptions{}) 125 if delErr != nil { 126 _ = e.emitter.StoreInt64(metricsNameEvictedReclaimedPodCount, 1, metrics.MetricTypeNameCount, 127 []metrics.MetricTag{ 128 {Key: "status", Val: "failed"}, 129 {Key: "name", Val: node.Name}, 130 }...) 131 errList = append(errList, delErr) 132 continue 133 } 134 135 _ = e.emitter.StoreInt64(metricsNameEvictedReclaimedPodCount, 1, metrics.MetricTypeNameCount, 136 []metrics.MetricTag{ 137 {Key: "status", Val: "success"}, 138 {Key: "name", Val: node.Name}, 139 }...) 140 } 141 if len(errList) > 0 { 142 return utilerrors.NewAggregate(errList) 143 } 144 145 return nil 146 }