// Source: github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/evictionmanager/podkiller/podkiller.go

/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podkiller

import (
	"context"
	"fmt"
	"math"
	"strconv"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"

	pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/rule"
	"github.com/kubewharf/katalyst-core/pkg/consts"
)

// PodKiller implements the killing actions for given pods.
type PodKiller interface {
	// Name returns the name that identifies a specific PodKiller.
	Name() string

	// Start starts the pod killer logic so that it is prepared to
	// receive on-killing pods.
	Start(ctx context.Context)

	// EvictPods sends a list of on-killing pods to the pod killer.
	EvictPods(rpList rule.RuledEvictPodList) error

	// EvictPod evicts a single pod with the specified grace period.
	EvictPod(rp *rule.RuledEvictPod) error
}

// DummyPodKiller is a stub implementation of the PodKiller interface.
59 type DummyPodKiller struct{} 60 61 func (d DummyPodKiller) Name() string { return "dummy-pod-killer" } 62 func (d DummyPodKiller) Start(_ context.Context) {} 63 func (d DummyPodKiller) EvictPods(rule.RuledEvictPodList) error { return nil } 64 func (d DummyPodKiller) EvictPod(*rule.RuledEvictPod) error { return nil } 65 66 var _ PodKiller = DummyPodKiller{} 67 68 // SynchronizedPodKiller trigger killing actions immediately after 69 // receiving killing requests; only returns true if all pods are 70 // successfully evicted. 71 type SynchronizedPodKiller struct { 72 killer Killer 73 } 74 75 func NewSynchronizedPodKiller(killer Killer) PodKiller { 76 return &SynchronizedPodKiller{ 77 killer: killer, 78 } 79 } 80 81 func (s *SynchronizedPodKiller) Name() string { return "synchronized-pod-killer" } 82 83 func (s *SynchronizedPodKiller) Start(_ context.Context) { 84 klog.Infof("[synchronized] pod-killer run with killer %v", s.killer.Name()) 85 defer klog.Infof("[synchronized] pod-killer started") 86 } 87 88 func (s *SynchronizedPodKiller) EvictPod(rp *rule.RuledEvictPod) error { 89 if rp == nil || rp.Pod == nil { 90 return fmt.Errorf("EvictPod got nil pod") 91 } 92 93 gracePeriod, err := getGracefulDeletionPeriod(rp.Pod, rp.DeletionOptions) 94 if err != nil { 95 return fmt.Errorf("getGracefulDeletionPeriod for pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err) 96 } 97 98 err = s.killer.Evict(context.Background(), rp.Pod, gracePeriod, rp.Reason, rp.EvictionPluginName) 99 if err != nil { 100 return fmt.Errorf("evict pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err) 101 } 102 103 return nil 104 } 105 106 func (s *SynchronizedPodKiller) EvictPods(rpList rule.RuledEvictPodList) error { 107 var errList []error 108 var mtx sync.Mutex 109 110 klog.Infof("[synchronized] pod-killer evict %d totally", len(rpList)) 111 syncNodeUtilizationAndAdjust := func(i int) { 112 err := s.EvictPod(rpList[i]) 113 114 mtx.Lock() 115 if err != nil { 116 
errList = append(errList, err) 117 } 118 mtx.Unlock() 119 } 120 workqueue.ParallelizeUntil(context.Background(), 3, len(rpList), syncNodeUtilizationAndAdjust) 121 122 klog.Infof("[synchronized] successfully evict %d totally", len(rpList)-len(errList)) 123 return errors.NewAggregate(errList) 124 } 125 126 // AsynchronizedPodKiller pushed killing actions into a queue and 127 // returns true directly, another go routine will be responsible 128 // to perform killing actions instead. 129 type AsynchronizedPodKiller struct { 130 killer Killer 131 132 client kubernetes.Interface 133 134 // use map to act as a limited queue 135 queue workqueue.RateLimitingInterface 136 137 // processingPods is used to store pods that are being evicted 138 // the map is constructed as podName -> gracefulPeriod -> evictPodInfo 139 processingPods map[string]map[int64]*evictPodInfo 140 141 sync.RWMutex 142 } 143 144 type evictPodInfo struct { 145 Pod *v1.Pod 146 Reason string 147 Plugin string 148 } 149 150 func getEvictPodInfo(rp *rule.RuledEvictPod) *evictPodInfo { 151 return &evictPodInfo{ 152 Pod: rp.Pod.DeepCopy(), 153 Reason: rp.Reason, 154 Plugin: rp.EvictionPluginName, 155 } 156 } 157 158 func NewAsynchronizedPodKiller(killer Killer, client kubernetes.Interface) PodKiller { 159 a := &AsynchronizedPodKiller{ 160 killer: killer, 161 client: client, 162 processingPods: make(map[string]map[int64]*evictPodInfo), 163 } 164 a.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), a.Name()) 165 return a 166 } 167 168 func (a *AsynchronizedPodKiller) Name() string { return "asynchronous-pod-killer" } 169 170 func (a *AsynchronizedPodKiller) Start(ctx context.Context) { 171 klog.Infof("[asynchronous] pod-killer run with killer %v", a.killer.Name()) 172 defer klog.Infof("[asynchronous] pod-killer started") 173 174 for i := 0; i < 10; i++ { 175 go wait.Until(a.run, time.Second, ctx.Done()) 176 } 177 } 178 179 func (a *AsynchronizedPodKiller) EvictPods(rpList 
rule.RuledEvictPodList) error { 180 klog.Infof("[asynchronous] pod-killer evict %d totally", len(rpList)) 181 182 errList := make([]error, 0, len(rpList)) 183 for _, rp := range rpList { 184 err := a.EvictPod(rp) 185 if err != nil { 186 errList = append(errList, err) 187 } 188 } 189 190 klog.Infof("[asynchronous] successfully add %d pods to eviction queue", len(rpList)-len(errList)) 191 return errors.NewAggregate(errList) 192 } 193 194 func (a *AsynchronizedPodKiller) EvictPod(rp *rule.RuledEvictPod) error { 195 if rp == nil || rp.Pod == nil { 196 return fmt.Errorf("evictPod got nil pod") 197 } 198 199 gracePeriod, err := getGracefulDeletionPeriod(rp.Pod, rp.DeletionOptions) 200 if err != nil { 201 return fmt.Errorf("getGracefulDeletionPeriod for pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err) 202 } 203 podKey := podKeyFunc(rp.Pod.Namespace, rp.Pod.Name) 204 205 a.Lock() 206 if a.processingPods[podKey] != nil { 207 var minOne int64 = math.MaxInt64 208 for recordedGracePeriod := range a.processingPods[podKey] { 209 if recordedGracePeriod < minOne { 210 minOne = recordedGracePeriod 211 } 212 } 213 214 if gracePeriod >= minOne { 215 a.Unlock() 216 klog.Infof("[asynchronous] pod: %s/%s is being processed with smaller grace period, skip it", rp.Pod.Namespace, rp.Pod.Name) 217 return nil 218 } 219 } 220 221 if a.processingPods[podKey] == nil { 222 a.processingPods[podKey] = make(map[int64]*evictPodInfo) 223 } 224 225 a.processingPods[podKey][gracePeriod] = getEvictPodInfo(rp) 226 a.Unlock() 227 228 a.queue.AddRateLimited(evictionKeyFunc(podKey, gracePeriod)) 229 return nil 230 } 231 232 // run is a long-running function that will continually call the 233 // processNextItem function in order to read and process a message on the queue. 
234 func (a *AsynchronizedPodKiller) run() { 235 for a.processNextItem() { 236 } 237 } 238 239 // processNextItem will read a single work item off the queue and 240 // attempt to process it, by calling the sync function. 241 func (a *AsynchronizedPodKiller) processNextItem() bool { 242 obj, shutdown := a.queue.Get() 243 if shutdown { 244 return false 245 } 246 247 // We wrap this block in a func so we can defer c.workqueue.Done. 248 err := func(obj interface{}) error { 249 // We call Done here so the workqueue knows we have finished 250 // processing this item. We also must remember to call Forget if we 251 // do not want this work item being re-queued. For example, we do 252 // not call Forget if a transient error occurs, instead the item is 253 // put back on the workqueue and attempted again after a back-off 254 // period. 255 defer a.queue.Done(obj) 256 var key string 257 var ok bool 258 // We expect strings to come off the workqueue. These are of the 259 // form namespace/name. We do this as the delayed nature of the 260 // workqueue means the items in the informer cache may actually be 261 // more up to date that when the item was initially put onto the 262 // workqueue. 263 if key, ok = obj.(string); !ok { 264 // As the item in the workqueue is actually invalid, we call 265 // Forget here else we'd go into a loop of attempting to 266 // process a work item that is invalid. 267 a.queue.Forget(obj) 268 utilruntime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj)) 269 return nil 270 } 271 // Run the syncHandler, passing it the namespace/name string of the 272 // ExecDeploy resource to be synced. 273 if err, requeue := a.sync(key); err != nil { 274 // Put the item back on the workqueue to handle any transient errors. 
275 klog.Warningf("[asynchronous] error syncing '%s': %s, requeuing", key, err.Error()) 276 277 if requeue { 278 a.queue.AddRateLimited(key) 279 } else { 280 a.queue.Forget(obj) 281 } 282 283 return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) 284 } 285 // Finally, if no error occurs we Forget this item so it does not 286 // get queued again until another change happens. 287 a.queue.Forget(obj) 288 return nil 289 }(obj) 290 if err != nil { 291 utilruntime.HandleError(err) 292 return true 293 } 294 295 return true 296 } 297 298 func (a *AsynchronizedPodKiller) sync(key string) (retError error, requeue bool) { 299 namespace, name, gracePeriodSeconds, err := splitEvictionKey(key) 300 if err != nil { 301 return fmt.Errorf("[asynchronous] invalid resource key: %s got error: %v", key, err), false 302 } 303 304 podKey := podKeyFunc(namespace, name) 305 defer func() { 306 if !requeue { 307 a.Lock() 308 delete(a.processingPods[podKey], gracePeriodSeconds) 309 310 if len(a.processingPods[podKey]) == 0 { 311 delete(a.processingPods, podKey) 312 } 313 a.Unlock() 314 } 315 }() 316 317 // todo: actually, this function is safe enough without comparing with pod uid 318 // if the same pod is created just after the last one exists 319 // handle with more filters in the future 320 pod, err := a.client.CoreV1().Pods(namespace).Get(context.Background(), name, metav1.GetOptions{}) 321 if err != nil { 322 if apierrors.IsNotFound(err) { 323 klog.Infof("[asynchronous] %s/%s has already been deleted, skip", namespace, name) 324 return nil, false 325 } 326 return err, true 327 } 328 329 var reason, plugin string 330 a.RLock() 331 if a.processingPods[podKey][gracePeriodSeconds] == nil { 332 a.RUnlock() 333 return fmt.Errorf("[asynchronous] evict pod can't be found by podKey: %s and gracePeriodSeconds: %d", podKey, gracePeriodSeconds), false 334 } 335 reason = a.processingPods[podKey][gracePeriodSeconds].Reason 336 plugin = 
a.processingPods[podKey][gracePeriodSeconds].Plugin 337 a.RUnlock() 338 339 err = a.killer.Evict(context.Background(), pod, gracePeriodSeconds, reason, plugin) 340 if err != nil { 341 return err, true 342 } else { 343 return nil, false 344 } 345 } 346 347 func podKeyFunc(podNamespace, podName string) string { 348 return strings.Join([]string{podNamespace, podName}, consts.KeySeparator) 349 } 350 351 func evictionKeyFunc(podKey string, gracePeriodSeconds int64) string { 352 return strings.Join([]string{podKey, fmt.Sprintf("%d", gracePeriodSeconds)}, consts.KeySeparator) 353 } 354 355 func splitEvictionKey(key string) (string, string, int64, error) { 356 parts := strings.Split(key, consts.KeySeparator) 357 358 if len(parts) != 3 { 359 return "", "", 0, fmt.Errorf("unexpected key format: %s", key) 360 } 361 362 gracePeriodSeconds, err := strconv.ParseInt(parts[2], 10, 64) 363 if err != nil { 364 return "", "", 0, fmt.Errorf("unexpected gracePeriodSeconds: %s", parts[2]) 365 } 366 367 return parts[0], parts[1], gracePeriodSeconds, nil 368 } 369 370 func getGracefulDeletionPeriod(pod *v1.Pod, options *pluginapi.DeletionOptions) (int64, error) { 371 if pod == nil { 372 return 0, fmt.Errorf("getGracefulDeletionPeriod got nil pod") 373 } 374 375 // determine the grace period to use when killing the pod 376 gracePeriod := int64(0) 377 if options != nil { 378 if options.GracePeriodSeconds < 0 { 379 return 0, fmt.Errorf("deletion options with negative grace period seconds") 380 } 381 gracePeriod = options.GracePeriodSeconds 382 } else if pod.Spec.TerminationGracePeriodSeconds != nil { 383 gracePeriod = *pod.Spec.TerminationGracePeriodSeconds 384 } 385 386 return gracePeriod, nil 387 }