github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/evict.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package helper
    18  
    19  import (
    20  	"context"
    21  	"time"
    22  
    23  	corev1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    27  	"k8s.io/apimachinery/pkg/util/sets"
    28  	"k8s.io/apimachinery/pkg/util/wait"
    29  	corelisters "k8s.io/client-go/listers/core/v1"
    30  	"k8s.io/client-go/tools/cache"
    31  	"k8s.io/klog/v2"
    32  	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
    33  
    34  	listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1"
    35  	"github.com/kubewharf/katalyst-core/pkg/client/control"
    36  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    37  )
    38  
// metricsNameEvictedReclaimedPodCount is the name of the count metric that
// evictPods stores once per pod deletion attempt, tagged with the outcome
// ("success" or "failed") and the node name.
const metricsNameEvictedReclaimedPodCount = "evicted_reclaimed_pod_count"
    40  
// EvictItem records the detailed item to perform pod eviction.
// doEviction reads it back from the UID of a queued value.
type EvictItem struct {
	// PodKeys maps from agent-name to pod-keys (that should be evicted because of the Agents).
	// The keys are later split into namespace and name with cache.SplitMetaNamespaceKey.
	PodKeys map[string][]string
}
    46  
// EvictHelper periodically processes a rate-limited queue of nodes and
// deletes the pods recorded for agents that are still reported as not ready.
type EvictHelper struct {
	// ctx stops the loop started by Run and is passed to every pod deletion;
	// emitter stores the per-pod result metric; podControl deletes the pods.
	ctx        context.Context
	emitter    metrics.MetricEmitter
	podControl control.PodEjector

	// checker re-confirms agent readiness right before eviction;
	// queue holds the to-be-evicted nodes, each carrying an *EvictItem as UID.
	checker *HealthzHelper
	queue   *scheduler.RateLimitedTimedQueue

	// nodeLister resolves queued node names. podLister is not populated by
	// NewEvictHelper, and neither podLister nor cnrLister is read in this file.
	podLister  corelisters.PodLister
	nodeLister corelisters.NodeLister
	cnrLister  listers.CustomNodeResourceLister
}
    59  
    60  // NewEvictHelper todo add logic here
    61  func NewEvictHelper(ctx context.Context, emitter metrics.MetricEmitter,
    62  	podControl control.PodEjector, nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister,
    63  	queue *scheduler.RateLimitedTimedQueue, checker *HealthzHelper,
    64  ) *EvictHelper {
    65  	return &EvictHelper{
    66  		ctx:        ctx,
    67  		emitter:    emitter,
    68  		podControl: podControl,
    69  
    70  		queue:   queue,
    71  		checker: checker,
    72  
    73  		nodeLister: nodeLister,
    74  		cnrLister:  cnrLister,
    75  	}
    76  }
    77  
    78  func (e *EvictHelper) Run() {
    79  	go wait.Until(e.doEviction, scheduler.NodeEvictionPeriod, e.ctx.Done())
    80  }
    81  
    82  // doEviction is used to pop nodes from to-be-evicted queue,
    83  // and then trigger the taint actions
    84  func (e *EvictHelper) doEviction() {
    85  	e.queue.Try(func(value scheduler.TimedValue) (bool, time.Duration) {
    86  		node, err := e.nodeLister.Get(value.Value)
    87  		if errors.IsNotFound(err) {
    88  			klog.Warningf("Node %v no longer present in nodeLister", value.Value)
    89  			return true, 0
    90  		} else if err != nil {
    91  			klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
    92  			// retry in 50 millisecond
    93  			return false, 50 * time.Millisecond
    94  		}
    95  
    96  		// second confirm that we should evict reclaimed pods
    97  		keys := sets.NewString()
    98  		item := value.UID.(*EvictItem)
    99  		for agent, names := range item.PodKeys {
   100  			if !e.checker.CheckAgentReady(node.Name, agent) {
   101  				keys.Insert(names...)
   102  			}
   103  		}
   104  
   105  		if err := e.evictPods(node, keys.List()); err != nil {
   106  			klog.Warningf("failed to evict pods for cnr %v: %v", value.Value, err)
   107  			return true, 5 * time.Second
   108  		}
   109  		return true, 0
   110  	})
   111  }
   112  
   113  // evictPods must filter out those pods that should be managed
   114  // todo evict pods in with concurrency if necessary
   115  func (e *EvictHelper) evictPods(node *corev1.Node, keys []string) error {
   116  	var errList []error
   117  	for _, key := range keys {
   118  		namespace, name, err := cache.SplitMetaNamespaceKey(key)
   119  		if err != nil {
   120  			klog.Errorf("failed to split namespace and name from key %s", key)
   121  			continue
   122  		}
   123  
   124  		delErr := e.podControl.DeletePod(e.ctx, namespace, name, metav1.DeleteOptions{})
   125  		if delErr != nil {
   126  			_ = e.emitter.StoreInt64(metricsNameEvictedReclaimedPodCount, 1, metrics.MetricTypeNameCount,
   127  				[]metrics.MetricTag{
   128  					{Key: "status", Val: "failed"},
   129  					{Key: "name", Val: node.Name},
   130  				}...)
   131  			errList = append(errList, delErr)
   132  			continue
   133  		}
   134  
   135  		_ = e.emitter.StoreInt64(metricsNameEvictedReclaimedPodCount, 1, metrics.MetricTypeNameCount,
   136  			[]metrics.MetricTag{
   137  				{Key: "status", Val: "success"},
   138  				{Key: "name", Val: node.Name},
   139  			}...)
   140  	}
   141  	if len(errList) > 0 {
   142  		return utilerrors.NewAggregate(errList)
   143  	}
   144  
   145  	return nil
   146  }