github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/evictionmanager/podkiller/killer.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // todo: move APIServer update/patch/create actions to client package
    18  
    19  package podkiller
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"time"
    25  
    26  	"github.com/pkg/errors"
    27  	v1 "k8s.io/api/core/v1"
    28  	policy "k8s.io/api/policy/v1beta1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/util/wait"
    32  	"k8s.io/client-go/kubernetes"
    33  	"k8s.io/client-go/tools/events"
    34  	cri "k8s.io/cri-api/pkg/apis"
    35  	"k8s.io/klog/v2"
    36  	"k8s.io/kubernetes/pkg/kubelet/container"
    37  	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
    38  
    39  	"github.com/kubewharf/katalyst-core/pkg/config"
    40  	"github.com/kubewharf/katalyst-core/pkg/consts"
    41  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    42  )
    43  
    44  const (
    45  	MetricsNameKillPod       = "kill_pod"
    46  	MetricsNameKillContainer = "kill_container"
    47  )
    48  
    49  type InitFunc func(conf *config.Configuration, client kubernetes.Interface, recorder events.EventRecorder, emitter metrics.MetricEmitter) (Killer, error)
    50  
    51  // Killer implements pod eviction logic.
    52  type Killer interface {
    53  	// Name returns name as identifier for a specific Killer.
    54  	Name() string
    55  
    56  	// Evict for given pods and corresponding graceful period seconds.
    57  	Evict(ctx context.Context, pod *v1.Pod, gracePeriodSeconds int64, reason, plugin string) error
    58  }
    59  
    60  // DummyKiller is a stub implementation for Killer interface.
    61  type DummyKiller struct{}
    62  
    63  func (d DummyKiller) Name() string                                                   { return consts.KillerNameFakeKiller }
    64  func (d DummyKiller) Evict(_ context.Context, _ *v1.Pod, _ int64, _, _ string) error { return nil }
    65  
    66  var _ Killer = DummyKiller{}
    67  
    68  // EvictionAPIKiller implements Killer interface it evict those given pods by
    69  // eviction API, and wait until pods have actually been deleted.
    70  type EvictionAPIKiller struct {
    71  	emitter  metrics.MetricEmitter
    72  	client   kubernetes.Interface
    73  	recorder events.EventRecorder
    74  }
    75  
    76  // NewEvictionAPIKiller returns a new updater Object.
    77  func NewEvictionAPIKiller(_ *config.Configuration, client kubernetes.Interface, recorder events.EventRecorder, emitter metrics.MetricEmitter) (Killer, error) {
    78  	return &EvictionAPIKiller{
    79  		emitter:  emitter,
    80  		client:   client,
    81  		recorder: recorder,
    82  	}, nil
    83  }
    84  
    85  func (e *EvictionAPIKiller) Name() string { return consts.KillerNameEvictionKiller }
    86  
    87  func (e *EvictionAPIKiller) Evict(_ context.Context, pod *v1.Pod, gracePeriodSeconds int64, reason, plugin string) error {
    88  	const (
    89  		policyGroupVersion = "policy/v1beta1"
    90  		evictionKind       = "Eviction"
    91  	)
    92  
    93  	evictPod := func(pod *v1.Pod, gracePeriodOverride int64) error {
    94  		klog.Infof("[eviction-killer] send request for pod %v/%v", pod.Namespace, pod.Name)
    95  
    96  		deleteOptions := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriodOverride}
    97  		eviction := &policy.Eviction{
    98  			TypeMeta: metav1.TypeMeta{
    99  				APIVersion: policyGroupVersion,
   100  				Kind:       evictionKind,
   101  			},
   102  			ObjectMeta: metav1.ObjectMeta{
   103  				Name:      pod.Name,
   104  				Namespace: pod.Namespace,
   105  			},
   106  			DeleteOptions: deleteOptions,
   107  		}
   108  		return e.client.PolicyV1beta1().Evictions(eviction.Namespace).Evict(context.Background(), eviction)
   109  	}
   110  
   111  	return evict(e.client, e.recorder, e.emitter, pod, gracePeriodSeconds, reason, plugin, evictPod)
   112  }
   113  
   114  // DeletionAPIKiller implements Killer interface it evict those
   115  // given pods by calling pod deletion API.
   116  type DeletionAPIKiller struct {
   117  	emitter  metrics.MetricEmitter
   118  	client   kubernetes.Interface
   119  	recorder events.EventRecorder
   120  }
   121  
   122  func NewDeletionAPIKiller(_ *config.Configuration, client kubernetes.Interface, recorder events.EventRecorder, emitter metrics.MetricEmitter) (Killer, error) {
   123  	return &DeletionAPIKiller{
   124  		emitter:  emitter,
   125  		client:   client,
   126  		recorder: recorder,
   127  	}, nil
   128  }
   129  
   130  func (d *DeletionAPIKiller) Name() string { return consts.KillerNameDeletionKiller }
   131  
   132  func (d *DeletionAPIKiller) Evict(ctx context.Context, pod *v1.Pod, gracePeriodSeconds int64, reason, plugin string) error {
   133  	evictPod := func(pod *v1.Pod, gracePeriodOverride int64) error {
   134  		klog.Infof("[deletion-killer] send request for pod %v/%v", pod.Namespace, pod.Name)
   135  
   136  		deleteOptions := metav1.DeleteOptions{GracePeriodSeconds: &gracePeriodOverride}
   137  		return d.client.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, deleteOptions)
   138  	}
   139  
   140  	return evict(d.client, d.recorder, d.emitter, pod, gracePeriodSeconds, reason, plugin, evictPod)
   141  }
   142  
   143  // getWaitingPeriod get waiting period from graceful period.
   144  func getWaitingPeriod(gracePeriod int64) time.Duration {
   145  	// the default timeout is relative to the grace period;
   146  	// settle on 10s to wait for kubelet->runtime traffic to complete in sigkill
   147  	timeout := gracePeriod + gracePeriod/2
   148  	minTimeout := int64(10)
   149  	if timeout < minTimeout {
   150  		timeout = minTimeout
   151  	}
   152  	return time.Duration(timeout) * time.Second
   153  }
   154  
   155  // waitForDeleted wait util pods have been physically deleted from APIServer.
   156  func waitForDeleted(client kubernetes.Interface, pods []*v1.Pod, timeout time.Duration) ([]*v1.Pod, error) {
   157  	const interval = time.Second * 5
   158  	err := wait.PollImmediate(interval, timeout, func() (bool, error) {
   159  		var pendingPods []*v1.Pod
   160  		for i, pod := range pods {
   161  			// todo: refer through ETCD to make sure pods are physically deleted (is it reasonable?)
   162  			p, err := client.CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{})
   163  			if apierrors.IsNotFound(err) || (p != nil && p.ObjectMeta.UID != pod.ObjectMeta.UID) {
   164  				continue
   165  			} else if err != nil {
   166  				return false, err
   167  			} else {
   168  				pendingPods = append(pendingPods, pods[i])
   169  			}
   170  		}
   171  		pods = pendingPods
   172  		if len(pendingPods) > 0 {
   173  			return false, nil
   174  		}
   175  		return true, nil
   176  	})
   177  	return pods, err
   178  }
   179  
   180  // deleteWithRetry keeping calling deletion func until it checks pods
   181  // have been deleted timeout and return an error if it doesn't get a
   182  // callback within a reasonable time.
   183  func deleteWithRetry(pod *v1.Pod, gracePeriod int64, timeoutDuration time.Duration,
   184  	evictPod func(_ *v1.Pod, gracePeriod int64) error,
   185  ) error {
   186  	timeoutTick := time.NewTimer(timeoutDuration)
   187  	for {
   188  		success := false
   189  		select {
   190  		case <-timeoutTick.C:
   191  			return errors.Errorf("eviction request did not complete within %v", timeoutDuration)
   192  		default:
   193  			err := evictPod(pod, gracePeriod)
   194  			if err == nil {
   195  				success = true
   196  				break
   197  			} else if apierrors.IsNotFound(err) {
   198  				success = true
   199  				break
   200  			} else if apierrors.IsTooManyRequests(err) {
   201  				delay, retry := apierrors.SuggestsClientDelay(err)
   202  				if !retry {
   203  					delay = 5
   204  				}
   205  				time.Sleep(time.Duration(delay) * time.Second)
   206  			} else {
   207  				return errors.Errorf("error when evicting pod %q: %v", pod.Name, err)
   208  			}
   209  		}
   210  
   211  		if success {
   212  			break
   213  		}
   214  	}
   215  
   216  	return nil
   217  }
   218  
   219  // evict all killer implementations will perform evict actions.
   220  func evict(client kubernetes.Interface, recorder events.EventRecorder, emitter metrics.MetricEmitter, pod *v1.Pod,
   221  	gracePeriodSeconds int64, reason, plugin string, evictPod func(_ *v1.Pod, gracePeriod int64) error,
   222  ) error {
   223  	timeoutDuration := getWaitingPeriod(gracePeriodSeconds)
   224  	klog.Infof("[killer] evict pod %v/%v with graceful seconds %v", pod.Namespace, pod.Name, gracePeriodSeconds)
   225  
   226  	if err := deleteWithRetry(pod, gracePeriodSeconds, timeoutDuration, evictPod); err != nil {
   227  		recorder.Eventf(pod, nil, v1.EventTypeWarning, consts.EventReasonEvictFailed, consts.EventActionEvicting,
   228  			fmt.Sprintf("Evict failed: %s", err))
   229  		_ = emitter.StoreInt64(MetricsNameKillPod, 1, metrics.MetricTypeNameRaw,
   230  			metrics.MetricTag{Key: "state", Val: "failed"},
   231  			metrics.MetricTag{Key: "pod_ns", Val: pod.Namespace},
   232  			metrics.MetricTag{Key: "pod_name", Val: pod.Name},
   233  			metrics.MetricTag{Key: "plugin_name", Val: plugin})
   234  
   235  		return fmt.Errorf("evict failed %v", err)
   236  	}
   237  
   238  	recorder.Eventf(pod, nil, v1.EventTypeNormal, consts.EventReasonEvictCreated, consts.EventActionEvicting,
   239  		"Successfully create eviction; reason: %s", reason)
   240  	_ = emitter.StoreInt64(MetricsNameKillPod, 1, metrics.MetricTypeNameRaw,
   241  		metrics.MetricTag{Key: "state", Val: "succeeded"},
   242  		metrics.MetricTag{Key: "pod_ns", Val: pod.Namespace},
   243  		metrics.MetricTag{Key: "pod_name", Val: pod.Name},
   244  		metrics.MetricTag{Key: "plugin_name", Val: plugin})
   245  	klog.Infof("[killer] successfully create eviction for pod %v/%v", pod.Namespace, pod.Name)
   246  
   247  	podArray := []*v1.Pod{pod}
   248  	_, err := waitForDeleted(client, podArray, timeoutDuration)
   249  	if err != nil {
   250  		recorder.Eventf(pod, nil, v1.EventTypeWarning, consts.EventReasonEvictExceededGracePeriod, consts.EventActionEvicting,
   251  			"Container runtime did not kill the pod within specified grace period")
   252  
   253  		return fmt.Errorf("container deletion did not complete within %v", timeoutDuration)
   254  	}
   255  
   256  	recorder.Eventf(pod, nil, v1.EventTypeNormal, consts.EventReasonEvictSucceeded, consts.EventActionEvicting,
   257  		"Evicted pod has been deleted physically; reason: %s", reason)
   258  	klog.Infof("[killer] pod %s/%s has been deleted physically", pod.Namespace, pod.Name)
   259  
   260  	return nil
   261  }
   262  
   263  // ContainerKiller implements Killer interface it actually does not evict pod but
   264  // stop containers in given pod directly.
   265  type ContainerKiller struct {
   266  	containerManager cri.ContainerManager
   267  	recorder         events.EventRecorder
   268  	emitter          metrics.MetricEmitter
   269  }
   270  
   271  func NewContainerKiller(conf *config.Configuration, _ kubernetes.Interface, recorder events.EventRecorder, emitter metrics.MetricEmitter) (Killer, error) {
   272  	remoteRuntimeService, err := remote.NewRemoteRuntimeService(conf.RuntimeEndpoint, 2*time.Minute)
   273  	if err != nil {
   274  		return nil, err
   275  	}
   276  
   277  	return &ContainerKiller{
   278  		containerManager: remoteRuntimeService,
   279  		recorder:         recorder,
   280  		emitter:          emitter,
   281  	}, nil
   282  }
   283  
   284  func (c *ContainerKiller) Name() string { return consts.KillerNameContainerKiller }
   285  
   286  func (c *ContainerKiller) Evict(_ context.Context, pod *v1.Pod, gracePeriodSeconds int64, reason, plugin string) error {
   287  	if pod == nil {
   288  		return fmt.Errorf("pod is nil")
   289  	}
   290  
   291  	for _, containerStatus := range pod.Status.ContainerStatuses {
   292  		containerID := container.ParseContainerID(containerStatus.ContainerID)
   293  		err := c.containerManager.StopContainer(containerID.ID, gracePeriodSeconds)
   294  		if err != nil {
   295  			c.recorder.Eventf(pod, nil, v1.EventTypeNormal, consts.EventReasonContainerStopped, consts.EventActionContainerStopping,
   296  				"Failed to kill container %v; reason: %s", containerStatus.Name, reason)
   297  			_ = c.emitter.StoreInt64(MetricsNameKillContainer, 1, metrics.MetricTypeNameRaw,
   298  				metrics.MetricTag{Key: "state", Val: "failed"},
   299  				metrics.MetricTag{Key: "pod_ns", Val: pod.Namespace},
   300  				metrics.MetricTag{Key: "pod_name", Val: pod.Name},
   301  				metrics.MetricTag{Key: "container_name", Val: containerStatus.Name},
   302  				metrics.MetricTag{Key: "plugin_name", Val: plugin})
   303  			klog.Infof("[killer] failed to kill container %v(containerID: %v) for pod %v/%v, error:%v", containerStatus.Name, containerID, pod.Namespace, pod.Name, err)
   304  			return fmt.Errorf("ContainerKiller stop container %v failed with error: %v", containerStatus.ContainerID, err)
   305  		}
   306  		c.recorder.Eventf(pod, nil, v1.EventTypeNormal, consts.EventReasonContainerStopped, consts.EventActionContainerStopping,
   307  			"Successfully kill container %v; reason: %s", containerStatus.Name, reason)
   308  		_ = c.emitter.StoreInt64(MetricsNameKillContainer, 1, metrics.MetricTypeNameRaw,
   309  			metrics.MetricTag{Key: "state", Val: "succeeded"},
   310  			metrics.MetricTag{Key: "pod_ns", Val: pod.Namespace},
   311  			metrics.MetricTag{Key: "pod_name", Val: pod.Name},
   312  			metrics.MetricTag{Key: "container_name", Val: containerStatus.Name},
   313  			metrics.MetricTag{Key: "plugin_name", Val: plugin})
   314  		klog.Infof("[killer] successfully kill container %v/%v for pod %v/%v", containerStatus.Name, containerStatus.ContainerID, pod.Namespace, pod.Name)
   315  	}
   316  	// TODO: do we have to wait for container being completely killed?
   317  
   318  	return nil
   319  }