github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/evictionmanager/podkiller/podkiller.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package podkiller
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"strconv"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/util/errors"
    32  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  	"k8s.io/client-go/kubernetes"
    35  	"k8s.io/client-go/util/workqueue"
    36  	"k8s.io/klog/v2"
    37  
    38  	pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
    39  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/rule"
    40  	"github.com/kubewharf/katalyst-core/pkg/consts"
    41  )
    42  
// PodKiller implements the killing actions for given pods.
// DummyPodKiller, SynchronizedPodKiller and AsynchronizedPodKiller
// are the implementations provided in this file.
type PodKiller interface {
	// Name returns the name used as identifier for a specific Killer.
	Name() string

	// Start starts the pod killer logic and prepares it to receive
	// on-killing pods.
	Start(ctx context.Context)

	// EvictPods sends a list of on-killing pods to the pod killer.
	EvictPods(rpList rule.RuledEvictPodList) error

	// EvictPod evicts a single pod with the specified grace period.
	EvictPod(rp *rule.RuledEvictPod) error
}
    57  
    58  // DummyPodKiller is a stub implementation for Killer interface.
    59  type DummyPodKiller struct{}
    60  
    61  func (d DummyPodKiller) Name() string                           { return "dummy-pod-killer" }
    62  func (d DummyPodKiller) Start(_ context.Context)                {}
    63  func (d DummyPodKiller) EvictPods(rule.RuledEvictPodList) error { return nil }
    64  func (d DummyPodKiller) EvictPod(*rule.RuledEvictPod) error     { return nil }
    65  
    66  var _ PodKiller = DummyPodKiller{}
    67  
// SynchronizedPodKiller triggers killing actions immediately after
// receiving killing requests; its EvictPods aggregates the errors of
// every pod that failed to be evicted.
type SynchronizedPodKiller struct {
	// killer performs the actual eviction of a single pod.
	killer Killer
}
    74  
    75  func NewSynchronizedPodKiller(killer Killer) PodKiller {
    76  	return &SynchronizedPodKiller{
    77  		killer: killer,
    78  	}
    79  }
    80  
    81  func (s *SynchronizedPodKiller) Name() string { return "synchronized-pod-killer" }
    82  
    83  func (s *SynchronizedPodKiller) Start(_ context.Context) {
    84  	klog.Infof("[synchronized] pod-killer run with killer %v", s.killer.Name())
    85  	defer klog.Infof("[synchronized] pod-killer started")
    86  }
    87  
    88  func (s *SynchronizedPodKiller) EvictPod(rp *rule.RuledEvictPod) error {
    89  	if rp == nil || rp.Pod == nil {
    90  		return fmt.Errorf("EvictPod got nil pod")
    91  	}
    92  
    93  	gracePeriod, err := getGracefulDeletionPeriod(rp.Pod, rp.DeletionOptions)
    94  	if err != nil {
    95  		return fmt.Errorf("getGracefulDeletionPeriod for pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err)
    96  	}
    97  
    98  	err = s.killer.Evict(context.Background(), rp.Pod, gracePeriod, rp.Reason, rp.EvictionPluginName)
    99  	if err != nil {
   100  		return fmt.Errorf("evict pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err)
   101  	}
   102  
   103  	return nil
   104  }
   105  
   106  func (s *SynchronizedPodKiller) EvictPods(rpList rule.RuledEvictPodList) error {
   107  	var errList []error
   108  	var mtx sync.Mutex
   109  
   110  	klog.Infof("[synchronized] pod-killer evict %d totally", len(rpList))
   111  	syncNodeUtilizationAndAdjust := func(i int) {
   112  		err := s.EvictPod(rpList[i])
   113  
   114  		mtx.Lock()
   115  		if err != nil {
   116  			errList = append(errList, err)
   117  		}
   118  		mtx.Unlock()
   119  	}
   120  	workqueue.ParallelizeUntil(context.Background(), 3, len(rpList), syncNodeUtilizationAndAdjust)
   121  
   122  	klog.Infof("[synchronized] successfully evict %d totally", len(rpList)-len(errList))
   123  	return errors.NewAggregate(errList)
   124  }
   125  
// AsynchronizedPodKiller pushes killing actions into a queue and
// returns directly; other goroutines are responsible for performing
// the killing actions instead.
type AsynchronizedPodKiller struct {
	// killer performs the actual eviction of a single pod.
	killer Killer

	// client is used to fetch the latest pod object before evicting it.
	client kubernetes.Interface

	// queue holds the eviction keys (built by evictionKeyFunc) that are
	// waiting to be processed by the workers
	queue workqueue.RateLimitingInterface

	// processingPods is used to store pods that are being evicted
	// the map is constructed as podKey (built by podKeyFunc from namespace
	// and name) -> grace period in seconds -> evictPodInfo
	processingPods map[string]map[int64]*evictPodInfo

	// the embedded lock guards processingPods
	sync.RWMutex
}
   143  
// evictPodInfo records one pending eviction request kept in
// AsynchronizedPodKiller.processingPods.
type evictPodInfo struct {
	// Pod is a deep copy of the pod carried by the eviction request.
	Pod *v1.Pod
	// Reason is the eviction reason carried by the request.
	Reason string
	// Plugin is the name of the eviction plugin that issued the request.
	Plugin string
}
   149  
   150  func getEvictPodInfo(rp *rule.RuledEvictPod) *evictPodInfo {
   151  	return &evictPodInfo{
   152  		Pod:    rp.Pod.DeepCopy(),
   153  		Reason: rp.Reason,
   154  		Plugin: rp.EvictionPluginName,
   155  	}
   156  }
   157  
   158  func NewAsynchronizedPodKiller(killer Killer, client kubernetes.Interface) PodKiller {
   159  	a := &AsynchronizedPodKiller{
   160  		killer:         killer,
   161  		client:         client,
   162  		processingPods: make(map[string]map[int64]*evictPodInfo),
   163  	}
   164  	a.queue = workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), a.Name())
   165  	return a
   166  }
   167  
   168  func (a *AsynchronizedPodKiller) Name() string { return "asynchronous-pod-killer" }
   169  
   170  func (a *AsynchronizedPodKiller) Start(ctx context.Context) {
   171  	klog.Infof("[asynchronous] pod-killer run with killer %v", a.killer.Name())
   172  	defer klog.Infof("[asynchronous] pod-killer started")
   173  
   174  	for i := 0; i < 10; i++ {
   175  		go wait.Until(a.run, time.Second, ctx.Done())
   176  	}
   177  }
   178  
   179  func (a *AsynchronizedPodKiller) EvictPods(rpList rule.RuledEvictPodList) error {
   180  	klog.Infof("[asynchronous] pod-killer evict %d totally", len(rpList))
   181  
   182  	errList := make([]error, 0, len(rpList))
   183  	for _, rp := range rpList {
   184  		err := a.EvictPod(rp)
   185  		if err != nil {
   186  			errList = append(errList, err)
   187  		}
   188  	}
   189  
   190  	klog.Infof("[asynchronous] successfully add %d pods to eviction queue", len(rpList)-len(errList))
   191  	return errors.NewAggregate(errList)
   192  }
   193  
   194  func (a *AsynchronizedPodKiller) EvictPod(rp *rule.RuledEvictPod) error {
   195  	if rp == nil || rp.Pod == nil {
   196  		return fmt.Errorf("evictPod got nil pod")
   197  	}
   198  
   199  	gracePeriod, err := getGracefulDeletionPeriod(rp.Pod, rp.DeletionOptions)
   200  	if err != nil {
   201  		return fmt.Errorf("getGracefulDeletionPeriod for pod: %s/%s failed with error: %v", rp.Pod.Namespace, rp.Pod.Name, err)
   202  	}
   203  	podKey := podKeyFunc(rp.Pod.Namespace, rp.Pod.Name)
   204  
   205  	a.Lock()
   206  	if a.processingPods[podKey] != nil {
   207  		var minOne int64 = math.MaxInt64
   208  		for recordedGracePeriod := range a.processingPods[podKey] {
   209  			if recordedGracePeriod < minOne {
   210  				minOne = recordedGracePeriod
   211  			}
   212  		}
   213  
   214  		if gracePeriod >= minOne {
   215  			a.Unlock()
   216  			klog.Infof("[asynchronous] pod: %s/%s is being processed with smaller grace period, skip it", rp.Pod.Namespace, rp.Pod.Name)
   217  			return nil
   218  		}
   219  	}
   220  
   221  	if a.processingPods[podKey] == nil {
   222  		a.processingPods[podKey] = make(map[int64]*evictPodInfo)
   223  	}
   224  
   225  	a.processingPods[podKey][gracePeriod] = getEvictPodInfo(rp)
   226  	a.Unlock()
   227  
   228  	a.queue.AddRateLimited(evictionKeyFunc(podKey, gracePeriod))
   229  	return nil
   230  }
   231  
   232  // run is a long-running function that will continually call the
   233  // processNextItem function in order to read and process a message on the queue.
   234  func (a *AsynchronizedPodKiller) run() {
   235  	for a.processNextItem() {
   236  	}
   237  }
   238  
   239  // processNextItem will read a single work item off the queue and
   240  // attempt to process it, by calling the sync function.
   241  func (a *AsynchronizedPodKiller) processNextItem() bool {
   242  	obj, shutdown := a.queue.Get()
   243  	if shutdown {
   244  		return false
   245  	}
   246  
   247  	// We wrap this block in a func so we can defer c.workqueue.Done.
   248  	err := func(obj interface{}) error {
   249  		// We call Done here so the workqueue knows we have finished
   250  		// processing this item. We also must remember to call Forget if we
   251  		// do not want this work item being re-queued. For example, we do
   252  		// not call Forget if a transient error occurs, instead the item is
   253  		// put back on the workqueue and attempted again after a back-off
   254  		// period.
   255  		defer a.queue.Done(obj)
   256  		var key string
   257  		var ok bool
   258  		// We expect strings to come off the workqueue. These are of the
   259  		// form namespace/name. We do this as the delayed nature of the
   260  		// workqueue means the items in the informer cache may actually be
   261  		// more up to date that when the item was initially put onto the
   262  		// workqueue.
   263  		if key, ok = obj.(string); !ok {
   264  			// As the item in the workqueue is actually invalid, we call
   265  			// Forget here else we'd go into a loop of attempting to
   266  			// process a work item that is invalid.
   267  			a.queue.Forget(obj)
   268  			utilruntime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj))
   269  			return nil
   270  		}
   271  		// Run the syncHandler, passing it the namespace/name string of the
   272  		// ExecDeploy resource to be synced.
   273  		if err, requeue := a.sync(key); err != nil {
   274  			// Put the item back on the workqueue to handle any transient errors.
   275  			klog.Warningf("[asynchronous] error syncing '%s': %s, requeuing", key, err.Error())
   276  
   277  			if requeue {
   278  				a.queue.AddRateLimited(key)
   279  			} else {
   280  				a.queue.Forget(obj)
   281  			}
   282  
   283  			return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error())
   284  		}
   285  		// Finally, if no error occurs we Forget this item so it does not
   286  		// get queued again until another change happens.
   287  		a.queue.Forget(obj)
   288  		return nil
   289  	}(obj)
   290  	if err != nil {
   291  		utilruntime.HandleError(err)
   292  		return true
   293  	}
   294  
   295  	return true
   296  }
   297  
   298  func (a *AsynchronizedPodKiller) sync(key string) (retError error, requeue bool) {
   299  	namespace, name, gracePeriodSeconds, err := splitEvictionKey(key)
   300  	if err != nil {
   301  		return fmt.Errorf("[asynchronous] invalid resource key: %s got error: %v", key, err), false
   302  	}
   303  
   304  	podKey := podKeyFunc(namespace, name)
   305  	defer func() {
   306  		if !requeue {
   307  			a.Lock()
   308  			delete(a.processingPods[podKey], gracePeriodSeconds)
   309  
   310  			if len(a.processingPods[podKey]) == 0 {
   311  				delete(a.processingPods, podKey)
   312  			}
   313  			a.Unlock()
   314  		}
   315  	}()
   316  
   317  	// todo: actually, this function is safe enough without comparing with pod uid
   318  	//  if the same pod is created just after the last one exists
   319  	//  handle with more filters in the future
   320  	pod, err := a.client.CoreV1().Pods(namespace).Get(context.Background(), name, metav1.GetOptions{})
   321  	if err != nil {
   322  		if apierrors.IsNotFound(err) {
   323  			klog.Infof("[asynchronous] %s/%s has already been deleted, skip", namespace, name)
   324  			return nil, false
   325  		}
   326  		return err, true
   327  	}
   328  
   329  	var reason, plugin string
   330  	a.RLock()
   331  	if a.processingPods[podKey][gracePeriodSeconds] == nil {
   332  		a.RUnlock()
   333  		return fmt.Errorf("[asynchronous] evict pod can't be found by podKey: %s and gracePeriodSeconds: %d", podKey, gracePeriodSeconds), false
   334  	}
   335  	reason = a.processingPods[podKey][gracePeriodSeconds].Reason
   336  	plugin = a.processingPods[podKey][gracePeriodSeconds].Plugin
   337  	a.RUnlock()
   338  
   339  	err = a.killer.Evict(context.Background(), pod, gracePeriodSeconds, reason, plugin)
   340  	if err != nil {
   341  		return err, true
   342  	} else {
   343  		return nil, false
   344  	}
   345  }
   346  
   347  func podKeyFunc(podNamespace, podName string) string {
   348  	return strings.Join([]string{podNamespace, podName}, consts.KeySeparator)
   349  }
   350  
   351  func evictionKeyFunc(podKey string, gracePeriodSeconds int64) string {
   352  	return strings.Join([]string{podKey, fmt.Sprintf("%d", gracePeriodSeconds)}, consts.KeySeparator)
   353  }
   354  
   355  func splitEvictionKey(key string) (string, string, int64, error) {
   356  	parts := strings.Split(key, consts.KeySeparator)
   357  
   358  	if len(parts) != 3 {
   359  		return "", "", 0, fmt.Errorf("unexpected key format: %s", key)
   360  	}
   361  
   362  	gracePeriodSeconds, err := strconv.ParseInt(parts[2], 10, 64)
   363  	if err != nil {
   364  		return "", "", 0, fmt.Errorf("unexpected gracePeriodSeconds: %s", parts[2])
   365  	}
   366  
   367  	return parts[0], parts[1], gracePeriodSeconds, nil
   368  }
   369  
   370  func getGracefulDeletionPeriod(pod *v1.Pod, options *pluginapi.DeletionOptions) (int64, error) {
   371  	if pod == nil {
   372  		return 0, fmt.Errorf("getGracefulDeletionPeriod got nil pod")
   373  	}
   374  
   375  	// determine the grace period to use when killing the pod
   376  	gracePeriod := int64(0)
   377  	if options != nil {
   378  		if options.GracePeriodSeconds < 0 {
   379  			return 0, fmt.Errorf("deletion options with negative grace period seconds")
   380  		}
   381  		gracePeriod = options.GracePeriodSeconds
   382  	} else if pod.Spec.TerminationGracePeriodSeconds != nil {
   383  		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
   384  	}
   385  
   386  	return gracePeriod, nil
   387  }