github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/evictionmanager/manager.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
// Package evictionmanager implements the eviction manager of the katalyst agent: it periodically
// queries the registered eviction plugins and evicts the pods they select.
    19  package evictionmanager // import "github.com/kubewharf/katalyst-core/pkg/evictionmanager"
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	//nolint
    30  	"github.com/golang/protobuf/proto"
    31  	v1 "k8s.io/api/core/v1"
    32  	"k8s.io/apimachinery/pkg/util/errors"
    33  	"k8s.io/apimachinery/pkg/util/sets"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	"k8s.io/client-go/tools/events"
    36  	clocks "k8s.io/utils/clock"
    37  
    38  	"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    39  	"github.com/kubewharf/katalyst-api/pkg/plugins/registration"
    40  	pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
    41  	endpointpkg "github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/endpoint"
    42  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin"
    43  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/memory"
    44  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/resource"
    45  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin/rootfs"
    46  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/podkiller"
    47  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/rule"
    48  	"github.com/kubewharf/katalyst-core/pkg/client"
    49  	pkgconfig "github.com/kubewharf/katalyst-core/pkg/config"
    50  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    51  	"github.com/kubewharf/katalyst-core/pkg/consts"
    52  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    53  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    54  	"github.com/kubewharf/katalyst-core/pkg/util/credential"
    55  	"github.com/kubewharf/katalyst-core/pkg/util/credential/authorization"
    56  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    57  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    58  )
    59  
const (
	// Names of the metrics emitted by the eviction manager.
	MetricsNameVictimPodCNT           = "victims_cnt"
	MetricsNameRunningPodCNT          = "running_pod_cnt"
	MetricsNameCandidatePodCNT        = "candidate_pod_cnt"
	MetricsNameDryRunVictimPodCNT     = "dryrun_victims_cnt"
	MetricsNameRequestConditionCNT    = "request_condition_cnt"
	MetricsNameEvictionPluginCalled   = "eviction_plugin_called"
	MetricsNameEvictionPluginValidate = "eviction_plugin_validate"

	// Values of the "reason" tag reported when plugin validation fails.
	ValidateFailedReasonGetTokenFailed     = "get_token_failed"
	ValidateFailedReasonAuthenticateFailed = "authenticate_failed"
	ValidateFailedReasonNoPermission       = "no_permission"

	// UserUnknown is reported as the "user" tag when the plugin's identity could not be established.
	UserUnknown = "unknown"

	// MetricsPodLabelPrefix is prepended to pod label keys that are turned into metric tag keys.
	MetricsPodLabelPrefix = "pod"

	// Names under which the manager's periodic loops are registered as heartbeat health checks.
	evictionManagerHealthCheckName = "eviction_manager_sync"
	reportTaintHealthCheckName     = "eviction_manager_report_taint"
	// syncTolerationTurns is multiplied by the sync period to get the heartbeat tolerance of the sync loop.
	syncTolerationTurns = 3
	// reportTaintToleration is the heartbeat tolerance of the taint reporting loop.
	reportTaintToleration = 15 * time.Second
)
    82  
    83  // LatestCNRGetter returns the latest CNR resources.
    84  type LatestCNRGetter func() *v1alpha1.CustomNodeResource
    85  
    86  // LatestPodsGetter returns the latest pods that are running.
    87  type LatestPodsGetter func() []*v1.Pod
    88  
// EvictionManger reconciles to check if some threshold has been met, and
// triggers pod eviction actions if needed.
type EvictionManger struct {
	conf          *pkgconfig.Configuration
	genericClient *client.GenericClientSet

	// endpointLock guards endpoints.
	endpointLock sync.RWMutex
	// conditionLock guards conditions, conditionsLastObservedAt and thresholdsFirstObservedAt
	// when they are replaced at the end of a collection round.
	conditionLock sync.RWMutex

	// clock is an interface that provides time related functionality in a way that makes it
	// easy to test the code.
	clock clocks.WithTickerAndDelayedExecution

	// podKiller performs the actual eviction of the selected pods.
	podKiller podkiller.PodKiller

	// killQueue buffers the pods selected for eviction; killStrategy validates and sorts candidates.
	killQueue    rule.EvictionQueue
	killStrategy rule.EvictionStrategy

	// metaGetter is the meta server used to fetch metadata such as the pod list.
	metaGetter *metaserver.MetaServer
	// emitter is used to emit metrics.
	emitter metrics.MetricEmitter

	// endpoints caches registered eviction plugin endpoints, keyed by plugin name.
	endpoints map[string]endpointpkg.Endpoint
	// conditions maps condition name to *pluginapi.Condition, and they will be reported to node or CNR.
	conditions map[string]*pluginapi.Condition

	// conditionsLastObservedAt maps condition name to the condition together with the time it was last observed.
	conditionsLastObservedAt map[string]conditionObservedAt
	// thresholdsFirstObservedAt maps eviction plugin name to its met threshold together with the time it was first observed.
	thresholdsFirstObservedAt map[string]thresholdObservedAt

	// cred authenticates the token presented by a registering plugin; auth verifies its permission.
	cred credential.Credential
	auth authorization.AccessControl
}
   125  
// InnerEvictionPluginsDisabledByDefault is the set of inner eviction plugin names passed to
// general.IsNameEnabled as the disabled-by-default set; it is empty unless populated elsewhere.
var InnerEvictionPluginsDisabledByDefault = sets.NewString()
   127  
   128  func NewInnerEvictionPluginInitializers() map[string]plugin.InitFunc {
   129  	innerEvictionPluginInitializers := make(map[string]plugin.InitFunc)
   130  	innerEvictionPluginInitializers[resource.ReclaimedResourcesEvictionPluginName] = resource.NewReclaimedResourcesEvictionPlugin
   131  	innerEvictionPluginInitializers[memory.EvictionPluginNameNumaMemoryPressure] = memory.NewNumaMemoryPressureEvictionPlugin
   132  	innerEvictionPluginInitializers[memory.EvictionPluginNameSystemMemoryPressure] = memory.NewSystemPressureEvictionPlugin
   133  	innerEvictionPluginInitializers[memory.EvictionPluginNameRssOveruse] = memory.NewRssOveruseEvictionPlugin
   134  	innerEvictionPluginInitializers[rootfs.EvictionPluginNamePodRootfsPressure] = rootfs.NewPodRootfsPressureEvictionPlugin
   135  	return innerEvictionPluginInitializers
   136  }
   137  
   138  func NewPodKillerInitializers() map[string]podkiller.InitFunc {
   139  	podKillerInitializers := make(map[string]podkiller.InitFunc)
   140  	podKillerInitializers[consts.KillerNameEvictionKiller] = podkiller.NewEvictionAPIKiller
   141  	podKillerInitializers[consts.KillerNameDeletionKiller] = podkiller.NewDeletionAPIKiller
   142  	podKillerInitializers[consts.KillerNameContainerKiller] = podkiller.NewContainerKiller
   143  	return podKillerInitializers
   144  }
   145  
   146  func NewEvictionManager(genericClient *client.GenericClientSet, recorder events.EventRecorder,
   147  	metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, conf *pkgconfig.Configuration,
   148  ) (*EvictionManger, error) {
   149  	queue := rule.NewFIFOEvictionQueue(conf.EvictionBurst)
   150  
   151  	podKillerInitializers := NewPodKillerInitializers()
   152  	var killer podkiller.Killer
   153  	if initFunc, ok := podKillerInitializers[conf.PodKiller]; ok {
   154  		var initErr error
   155  		killer, initErr = initFunc(conf, genericClient.KubeClient, recorder, emitter)
   156  		if initErr != nil {
   157  			return nil, fmt.Errorf("failed to init pod killer %v: %v", conf.PodKiller, initErr)
   158  		}
   159  	} else {
   160  		return nil, fmt.Errorf("unsupported pod killer %v", conf.PodKiller)
   161  	}
   162  
   163  	podKiller := podkiller.NewAsynchronizedPodKiller(killer, genericClient.KubeClient)
   164  
   165  	e := &EvictionManger{
   166  		killQueue:    queue,
   167  		killStrategy: rule.NewEvictionStrategyImpl(conf),
   168  
   169  		metaGetter:                metaServer,
   170  		emitter:                   emitter,
   171  		podKiller:                 podKiller,
   172  		endpoints:                 make(map[string]endpointpkg.Endpoint),
   173  		conf:                      conf,
   174  		conditions:                make(map[string]*pluginapi.Condition),
   175  		conditionsLastObservedAt:  make(map[string]conditionObservedAt),
   176  		thresholdsFirstObservedAt: make(map[string]thresholdObservedAt),
   177  		clock:                     clocks.RealClock{},
   178  		genericClient:             genericClient,
   179  		cred:                      credential.DefaultCredential(),
   180  		auth:                      authorization.DefaultAccessControl(),
   181  	}
   182  
   183  	cred, credErr := credential.GetCredential(conf.GenericConfiguration, conf.DynamicAgentConfiguration)
   184  	if credErr != nil {
   185  		return nil, credErr
   186  	}
   187  	e.cred = cred
   188  
   189  	accessControl, acErr := authorization.GetAccessControl(conf.GenericConfiguration, conf.DynamicAgentConfiguration)
   190  	if acErr != nil {
   191  		return nil, acErr
   192  	}
   193  	e.auth = accessControl
   194  
   195  	e.getEvictionPlugins(genericClient, recorder, metaServer, emitter, conf, NewInnerEvictionPluginInitializers())
   196  	return e, nil
   197  }
   198  
   199  func (m *EvictionManger) getEvictionPlugins(genericClient *client.GenericClientSet, recorder events.EventRecorder, metaServer *metaserver.MetaServer,
   200  	emitter metrics.MetricEmitter, conf *pkgconfig.Configuration, innerEvictionPluginInitializers map[string]plugin.InitFunc,
   201  ) {
   202  	m.endpointLock.Lock()
   203  	for pluginName, initFn := range innerEvictionPluginInitializers {
   204  		if !general.IsNameEnabled(pluginName, InnerEvictionPluginsDisabledByDefault, conf.GenericEvictionConfiguration.InnerPlugins) {
   205  			general.Warningf(" %s is disabled", pluginName)
   206  			continue
   207  		}
   208  
   209  		curPlugin := initFn(genericClient, recorder, metaServer, emitter, conf)
   210  		m.endpoints[curPlugin.Name()] = curPlugin
   211  	}
   212  	m.endpointLock.Unlock()
   213  }
   214  
   215  func (m *EvictionManger) Run(ctx context.Context) {
   216  	general.Infof(" run with podKiller %v", m.podKiller.Name())
   217  	defer general.Infof(" started")
   218  	general.RegisterHeartbeatCheck(evictionManagerHealthCheckName, syncTolerationTurns*m.conf.EvictionManagerSyncPeriod,
   219  		general.HealthzCheckStateNotReady, syncTolerationTurns*m.conf.EvictionManagerSyncPeriod)
   220  	general.RegisterHeartbeatCheck(reportTaintHealthCheckName, reportTaintToleration,
   221  		general.HealthzCheckStateNotReady, reportTaintToleration)
   222  	m.podKiller.Start(ctx)
   223  	for _, endpoint := range m.endpoints {
   224  		endpoint.Start()
   225  	}
   226  	m.cred.Run(ctx)
   227  	m.auth.Run(ctx)
   228  	go wait.UntilWithContext(ctx, m.sync, m.conf.EvictionManagerSyncPeriod)
   229  	go wait.UntilWithContext(ctx, m.reportConditionsAsNodeTaints, time.Second*5)
   230  	<-ctx.Done()
   231  }
   232  
   233  func (m *EvictionManger) sync(ctx context.Context) {
   234  	var err error
   235  	defer func() {
   236  		_ = general.UpdateHealthzStateByError(evictionManagerHealthCheckName, err)
   237  	}()
   238  
   239  	activePods, err := m.metaGetter.GetPodList(ctx, native.PodIsActive)
   240  	if err != nil {
   241  		general.Errorf("failed to list pods from metaServer: %v", err)
   242  		return
   243  	}
   244  
   245  	general.Infof(" currently, there are %v active pods", len(activePods))
   246  	_ = m.emitter.StoreInt64(MetricsNameRunningPodCNT, int64(len(activePods)), metrics.MetricTypeNameRaw)
   247  
   248  	pods := native.FilterOutSkipEvictionPods(activePods, m.conf.EvictionSkippedAnnotationKeys, m.conf.EvictionSkippedLabelKeys)
   249  	general.Infof(" currently, there are %v candidate pods", len(pods))
   250  	_ = m.emitter.StoreInt64(MetricsNameCandidatePodCNT, int64(len(pods)), metrics.MetricTypeNameRaw)
   251  
   252  	errList := make([]error, 0)
   253  	collector, collectErr := m.collectEvictionResult(pods)
   254  	if collectErr != nil {
   255  		errList = append(errList, collectErr)
   256  	}
   257  
   258  	evictErr := m.doEvict(collector.getSoftEvictPods(), collector.getForceEvictPods())
   259  	if evictErr != nil {
   260  		errList = append(errList, evictErr)
   261  	}
   262  	if len(errList) > 0 {
   263  		err = errors.NewAggregate(errList)
   264  	}
   265  }
   266  
   267  func (m *EvictionManger) collectEvictionResult(pods []*v1.Pod) (*evictionRespCollector, error) {
   268  	dynamicConfig := m.conf.GetDynamicConfiguration()
   269  	collector := newEvictionRespCollector(dynamicConfig.DryRun, m.conf, m.emitter)
   270  	var errList []error
   271  
   272  	m.endpointLock.RLock()
   273  	for pluginName, ep := range m.endpoints {
   274  		_ = m.emitter.StoreInt64(MetricsNameEvictionPluginCalled, 1, metrics.MetricTypeNameCount,
   275  			metrics.MetricTag{Key: "name", Val: pluginName})
   276  
   277  		getEvictResp, err := ep.GetEvictPods(context.Background(), &pluginapi.GetEvictPodsRequest{
   278  			ActivePods: pods,
   279  		})
   280  		if err != nil {
   281  			general.Errorf(" calling GetEvictPods of plugin: %s failed with error: %v", pluginName, err)
   282  			errList = append(errList, err)
   283  		} else if getEvictResp == nil {
   284  			general.Errorf(" calling GetEvictPods of plugin: %s and getting nil resp", pluginName)
   285  		} else {
   286  			general.Infof(" GetEvictPods of plugin: %s with %d pods to evict", pluginName, len(getEvictResp.EvictPods))
   287  			collector.collectEvictPods(dynamicConfig.DryRun, pluginName, getEvictResp)
   288  		}
   289  
   290  		metResp, err := ep.ThresholdMet(context.Background())
   291  		if err != nil {
   292  			general.Errorf(" calling ThresholdMet of plugin: %s failed with error: %v", pluginName, err)
   293  			errList = append(errList, err)
   294  			continue
   295  		} else if metResp == nil {
   296  			general.Errorf(" calling ThresholdMet of plugin: %s and getting nil resp", pluginName)
   297  			continue
   298  		}
   299  
   300  		collector.collectMetThreshold(dynamicConfig.DryRun, pluginName, metResp)
   301  	}
   302  	m.endpointLock.RUnlock()
   303  
   304  	// track when a threshold was first observed
   305  	now := m.clock.Now()
   306  	thresholdsFirstObservedAt := thresholdsFirstObservedAt(collector.currentMetThresholds, m.thresholdsFirstObservedAt, now)
   307  	thresholdsMet := thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
   308  	logConfirmedThresholdMet(thresholdsMet)
   309  
   310  	// track when a condition was last observed
   311  	conditionsLastObservedAt := conditionsLastObservedAt(collector.currentConditions, m.conditionsLastObservedAt, now)
   312  	// conditions report true if it has been observed within the transition period window
   313  	conditions := conditionsObservedSince(conditionsLastObservedAt, m.conf.ConditionTransitionPeriod, now)
   314  	logConfirmedConditions(conditions)
   315  
   316  	m.conditionLock.Lock()
   317  	m.conditions = conditions
   318  	m.conditionsLastObservedAt = conditionsLastObservedAt
   319  	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
   320  	m.conditionLock.Unlock()
   321  
   322  	for pluginName, threshold := range thresholdsMet {
   323  		if threshold.MetType != pluginapi.ThresholdMetType_HARD_MET {
   324  			general.Infof(" the type: %s of met threshold from plugin: %s isn't  %s", threshold.MetType.String(), pluginName, pluginapi.ThresholdMetType_HARD_MET.String())
   325  			continue
   326  		}
   327  
   328  		m.endpointLock.RLock()
   329  		if m.endpoints[pluginName] == nil {
   330  			general.Errorf(" pluginName points to nil endpoint, can't handle threshold from it")
   331  		}
   332  
   333  		resp, err := m.endpoints[pluginName].GetTopEvictionPods(context.Background(), &pluginapi.GetTopEvictionPodsRequest{
   334  			ActivePods:    pods,
   335  			TopN:          1,
   336  			EvictionScope: threshold.EvictionScope,
   337  		})
   338  
   339  		m.endpointLock.RUnlock()
   340  		if err != nil {
   341  			general.Errorf(" calling GetTopEvictionPods of plugin: %s failed with error: %v", pluginName, err)
   342  			errList = append(errList, err)
   343  			continue
   344  		} else if resp == nil {
   345  			general.Errorf(" calling GetTopEvictionPods of plugin: %s and getting nil resp", pluginName)
   346  			continue
   347  		} else if len(resp.TargetPods) == 0 {
   348  			general.Warningf(" calling GetTopEvictionPods of plugin: %s and getting empty target pods", pluginName)
   349  			continue
   350  		}
   351  
   352  		collector.collectTopEvictionPods(dynamicConfig.DryRun, pluginName, threshold, resp)
   353  	}
   354  
   355  	return collector, errors.NewAggregate(errList)
   356  }
   357  
   358  func (m *EvictionManger) doEvict(softEvictPods, forceEvictPods map[string]*rule.RuledEvictPod) error {
   359  	softEvictPods = filterOutCandidatePodsWithForcePods(softEvictPods, forceEvictPods)
   360  	bestSuitedCandidate := m.getEvictPodFromCandidates(softEvictPods)
   361  	if bestSuitedCandidate != nil && bestSuitedCandidate.Pod != nil {
   362  		general.Infof(" choose best suited pod: %s/%s", bestSuitedCandidate.Pod.Namespace, bestSuitedCandidate.Pod.Name)
   363  		forceEvictPods[string(bestSuitedCandidate.Pod.UID)] = bestSuitedCandidate
   364  	}
   365  
   366  	rpList := rule.RuledEvictPodList{}
   367  	for _, rp := range forceEvictPods {
   368  		if rp != nil && rp.EvictPod.Pod != nil && m.killStrategy.CandidateValidate(rp) {
   369  			general.Infof(" ready to evict %s/%s, reason: %s", rp.Pod.Namespace, rp.Pod.Name, rp.Reason)
   370  			rpList = append(rpList, rp)
   371  		} else {
   372  			general.Warningf(" found nil pod in forceEvictPods")
   373  		}
   374  	}
   375  
   376  	err := m.killWithRules(rpList)
   377  	if err != nil {
   378  		general.Errorf(" got err: %v in EvictPods", err)
   379  		return err
   380  	}
   381  
   382  	general.Infof(" evict %d pods in evictionmanager", len(rpList))
   383  	_ = m.emitter.StoreInt64(MetricsNameVictimPodCNT, int64(len(rpList)), metrics.MetricTypeNameRaw,
   384  		metrics.MetricTag{Key: "type", Val: "total"})
   385  	metricPodsToEvict(m.emitter, rpList, m.conf.GenericConfiguration.QoSConfiguration, m.conf.GenericEvictionConfiguration.PodMetricLabels)
   386  	return nil
   387  }
   388  
   389  // ValidatePlugin validates a plugin if the version is correct and the name has the format of an extended resource
   390  func (m *EvictionManger) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
   391  	general.Infof(" got plugin %s at endpoint %s with versions %v", pluginName, endpoint, versions)
   392  
   393  	if !m.isVersionCompatibleWithPlugin(versions) {
   394  		return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", pluginapi.Version, versions)
   395  	}
   396  
   397  	e, err := endpointpkg.NewRemoteEndpointImpl(endpoint, pluginName)
   398  	if err != nil {
   399  		return fmt.Errorf(" failed to dial resource plugin with socketPath %s: %v", endpoint, err)
   400  	}
   401  	defer e.Stop()
   402  
   403  	// try to push authentication process as far as we can even in non-strict mode, it helps to identify who
   404  	// registers this plugin
   405  	tokenResp, tokenErr := e.GetToken(context.TODO())
   406  	if tokenErr != nil {
   407  		m.emitPluginValidateResult(pluginName, m.conf.StrictAuthentication, false, ValidateFailedReasonGetTokenFailed, UserUnknown)
   408  		if m.conf.StrictAuthentication {
   409  			return fmt.Errorf(" failed to get token:%v", tokenErr)
   410  		}
   411  		general.Warningf("no valid token for plugin %s:%v", pluginName, tokenErr)
   412  		return nil
   413  	}
   414  
   415  	authInfo, authErr := m.cred.AuthToken(tokenResp.Token)
   416  	if authErr != nil {
   417  		m.emitPluginValidateResult(pluginName, m.conf.StrictAuthentication, false, ValidateFailedReasonAuthenticateFailed, UserUnknown)
   418  		if m.conf.StrictAuthentication {
   419  			return fmt.Errorf(" failed to verify token:%v", authErr)
   420  		}
   421  		general.Warningf("failed to verify token for plugin %s:%v", pluginName, authErr)
   422  		return nil
   423  	}
   424  
   425  	general.Infof("user %v request to register plugin %v", authInfo.SubjectName(), pluginName)
   426  
   427  	verifyErr := m.auth.Verify(authInfo, authorization.PermissionTypeEvictionPlugin)
   428  	if verifyErr != nil {
   429  		m.emitPluginValidateResult(pluginName, m.conf.StrictAuthentication, false, ValidateFailedReasonNoPermission, authInfo.SubjectName())
   430  		if m.conf.StrictAuthentication {
   431  			return err
   432  		}
   433  		return nil
   434  	}
   435  
   436  	m.emitPluginValidateResult(pluginName, m.conf.StrictAuthentication, true, "", authInfo.SubjectName())
   437  	return nil
   438  }
   439  
   440  func (m *EvictionManger) emitPluginValidateResult(pluginName string, strict bool, valid bool, reason string, user string) {
   441  	_ = m.emitter.StoreInt64(MetricsNameEvictionPluginValidate, 1, metrics.MetricTypeNameCount,
   442  		metrics.MetricTag{Key: "name", Val: pluginName},
   443  		metrics.MetricTag{Key: "strict", Val: strconv.FormatBool(strict)},
   444  		metrics.MetricTag{Key: "valid", Val: strconv.FormatBool(valid)},
   445  		metrics.MetricTag{Key: "reason", Val: reason},
   446  		metrics.MetricTag{Key: "user", Val: user})
   447  }
   448  
   449  func (m *EvictionManger) RegisterPlugin(pluginName string, endpoint string, _ []string) error {
   450  	general.Infof(" Registering Plugin %s at endpoint %s", pluginName, endpoint)
   451  
   452  	e, err := endpointpkg.NewRemoteEndpointImpl(endpoint, pluginName)
   453  	if err != nil {
   454  		return fmt.Errorf(" failed to dial resource plugin with socketPath %s: %v", endpoint, err)
   455  	}
   456  
   457  	m.registerEndpoint(pluginName, e)
   458  
   459  	return nil
   460  }
   461  
   462  func (m *EvictionManger) DeRegisterPlugin(pluginName string) {
   463  	m.endpointLock.Lock()
   464  	defer m.endpointLock.Unlock()
   465  
   466  	if eI, ok := m.endpoints[pluginName]; ok {
   467  		eI.Stop()
   468  	}
   469  }
   470  
// GetHandlerType returns registration.EvictionPlugin, the plugin type this manager handles.
func (m *EvictionManger) GetHandlerType() string {
	return registration.EvictionPlugin
}
   474  
   475  func (m *EvictionManger) registerEndpoint(pluginName string, e endpointpkg.Endpoint) {
   476  	m.endpointLock.Lock()
   477  	defer m.endpointLock.Unlock()
   478  
   479  	old, ok := m.endpoints[pluginName]
   480  	if ok && !old.IsStopped() {
   481  		general.Infof(" stop old endpoint: %s", pluginName)
   482  		old.Stop()
   483  	}
   484  
   485  	m.endpoints[pluginName] = e
   486  	e.Start()
   487  
   488  	general.Infof(" registered endpoint %s", pluginName)
   489  }
   490  
   491  func (m *EvictionManger) isVersionCompatibleWithPlugin(versions []string) bool {
   492  	for _, version := range versions {
   493  		for _, supportedVersion := range pluginapi.SupportedVersions {
   494  			if version == supportedVersion {
   495  				return true
   496  			}
   497  		}
   498  	}
   499  	return false
   500  }
   501  
   502  // killWithRules send killing requests according to pre-defined rules
   503  // currently, we will use FIFO (with rate limiting) to
   504  func (m *EvictionManger) killWithRules(rpList rule.RuledEvictPodList) error {
   505  	// withdraw previous candidate killing pods by set override params as true
   506  	m.killQueue.Add(rpList, true)
   507  	return m.podKiller.EvictPods(m.killQueue.Pop())
   508  }
   509  
   510  // getEvictPodFromCandidates returns the most critical pod to be evicted
   511  func (m *EvictionManger) getEvictPodFromCandidates(candidateEvictPods map[string]*rule.RuledEvictPod) *rule.RuledEvictPod {
   512  	rpList := rule.RuledEvictPodList{}
   513  	for _, rp := range candidateEvictPods {
   514  		// only killing pods that pass candidate validation
   515  		if rp != nil && rp.Pod != nil && m.killStrategy.CandidateValidate(rp) {
   516  			rpList = append(rpList, rp)
   517  		}
   518  	}
   519  	if len(rpList) == 0 {
   520  		return nil
   521  	}
   522  
   523  	m.killStrategy.CandidateSort(rpList)
   524  	return rpList[0]
   525  }
   526  
   527  // thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when active set of thresholds were initially met.
   528  func thresholdsFirstObservedAt(thresholds map[string]*pluginapi.ThresholdMetResponse, lastObservedAt map[string]thresholdObservedAt, now time.Time) map[string]thresholdObservedAt {
   529  	results := make(map[string]thresholdObservedAt)
   530  	for pluginName, threshold := range thresholds {
   531  		if threshold == nil {
   532  			continue
   533  		}
   534  
   535  		observedAt, found := lastObservedAt[pluginName]
   536  		if !found {
   537  			observedAt = thresholdObservedAt{
   538  				timestamp: now,
   539  			}
   540  		}
   541  		observedAt.threshold = proto.Clone(threshold).(*pluginapi.ThresholdMetResponse)
   542  
   543  		results[pluginName] = observedAt
   544  	}
   545  	return results
   546  }
   547  
   548  // conditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
   549  func conditionsLastObservedAt(conditions map[string]*pluginapi.Condition, lastObservedAt map[string]conditionObservedAt, now time.Time) map[string]conditionObservedAt {
   550  	results := make(map[string]conditionObservedAt)
   551  
   552  	// the input conditions were observed "now"
   553  	for conditionName, condition := range conditions {
   554  		results[conditionName] = conditionObservedAt{
   555  			condition: proto.Clone(condition).(*pluginapi.Condition),
   556  			timestamp: now,
   557  		}
   558  	}
   559  
   560  	// the conditions that were not observed now are merged in with their old time
   561  	for key, value := range lastObservedAt {
   562  		_, found := results[key]
   563  		if !found {
   564  			results[key] = value
   565  		}
   566  	}
   567  	return results
   568  }
   569  
   570  // conditionsObservedSince returns the set of conditions that have been observed within the specified period
   571  func conditionsObservedSince(conditionsObservedAt map[string]conditionObservedAt, period time.Duration, now time.Time) map[string]*pluginapi.Condition {
   572  	results := make(map[string]*pluginapi.Condition)
   573  
   574  	for conditionName, observedAt := range conditionsObservedAt {
   575  		duration := now.Sub(observedAt.timestamp)
   576  		if duration < period {
   577  			results[conditionName] = proto.Clone(observedAt.condition).(*pluginapi.Condition)
   578  		}
   579  	}
   580  	return results
   581  }
   582  
   583  // thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period
   584  func thresholdsMetGracePeriod(thresholdsObservedAt map[string]thresholdObservedAt, now time.Time) map[string]*pluginapi.ThresholdMetResponse {
   585  	results := make(map[string]*pluginapi.ThresholdMetResponse)
   586  
   587  	for pluginName, observedAt := range thresholdsObservedAt {
   588  		if observedAt.threshold == nil {
   589  			general.Errorf(" met nil threshold in observedAt of plugin: %s", pluginName)
   590  			continue
   591  		}
   592  
   593  		duration := now.Sub(observedAt.timestamp)
   594  		if duration.Seconds() < float64(observedAt.threshold.GracePeriodSeconds) {
   595  			general.InfoS(" eviction criteria not yet met", "threshold", observedAt.threshold.String(), "duration", duration)
   596  			continue
   597  		}
   598  		results[pluginName] = proto.Clone(observedAt.threshold).(*pluginapi.ThresholdMetResponse)
   599  	}
   600  	return results
   601  }
   602  
   603  // filterOutCandidatePodsWithForcePods returns candidateEvictPods that are not forced to be evicted
   604  func filterOutCandidatePodsWithForcePods(candidateEvictPods, forceEvictPods map[string]*rule.RuledEvictPod) map[string]*rule.RuledEvictPod {
   605  	ret := make(map[string]*rule.RuledEvictPod)
   606  
   607  	for podUID, pod := range candidateEvictPods {
   608  		if forceEvictPods[podUID] != nil {
   609  			continue
   610  		}
   611  
   612  		ret[podUID] = pod
   613  	}
   614  
   615  	return ret
   616  }
   617  
   618  func logConfirmedConditions(conditions map[string]*pluginapi.Condition) {
   619  	if len(conditions) == 0 {
   620  		general.Infof(" there is no condition confirmed")
   621  	}
   622  
   623  	for _, condition := range conditions {
   624  		if condition == nil {
   625  			continue
   626  		}
   627  
   628  		general.Infof(" confirmed condition: %s", condition.String())
   629  	}
   630  }
   631  
   632  func logConfirmedThresholdMet(thresholds map[string]*pluginapi.ThresholdMetResponse) {
   633  	if len(thresholds) == 0 {
   634  		general.Infof(" there is no met threshold confirmed")
   635  	}
   636  
   637  	for pluginName, threshold := range thresholds {
   638  		if threshold == nil {
   639  			continue
   640  		}
   641  
   642  		general.Infof(" confirmed met threshold: %s from plugin: %s", threshold.String(), pluginName)
   643  	}
   644  }
   645  
   646  func metricPodsToEvict(emitter metrics.MetricEmitter, rpList rule.RuledEvictPodList, qosConfig *generic.QoSConfiguration, podMetricLabels sets.String) {
   647  	if emitter == nil {
   648  		general.Errorf(" metricPodsToEvict got nil emitter")
   649  		return
   650  	}
   651  
   652  	for _, rp := range rpList {
   653  		if rp != nil && rp.EvictionPluginName != "" {
   654  			metricsPodToEvict(emitter, qosConfig, rp.EvictionPluginName, rp.Pod, false, podMetricLabels)
   655  		}
   656  	}
   657  }
   658  
   659  func metricsPodToEvict(emitter metrics.MetricEmitter, qosConfig *generic.QoSConfiguration, pluginName string, pod *v1.Pod, dryRun bool, podMetricLabels sets.String) {
   660  	podQosLevel := "unknown"
   661  	if qosConfig != nil {
   662  		qosLevel, err := qosConfig.GetQoSLevelForPod(pod)
   663  		if err == nil {
   664  			podQosLevel = qosLevel
   665  		}
   666  	}
   667  	metricKey := MetricsNameVictimPodCNT
   668  	if dryRun {
   669  		metricKey = MetricsNameDryRunVictimPodCNT
   670  	}
   671  
   672  	metricTags := []metrics.MetricTag{
   673  		{Key: "name", Val: pluginName},
   674  		{Key: "type", Val: "plugin"},
   675  		{Key: "victim_ns", Val: pod.Namespace},
   676  		{Key: "victim_name", Val: pod.Name},
   677  		{Key: "qos", Val: podQosLevel},
   678  	}
   679  	if pod.Labels != nil {
   680  		for _, metricLabel := range podMetricLabels.List() {
   681  			metricValue, ok := pod.Labels[metricLabel]
   682  			if ok {
   683  				metricTags = append(metricTags, metrics.MetricTag{
   684  					Key: genPodLabelMetricKey(metricLabel),
   685  					Val: metricValue,
   686  				})
   687  			}
   688  		}
   689  	}
   690  	_ = emitter.StoreInt64(metricKey, 1, metrics.MetricTypeNameRaw, metricTags...)
   691  }
   692  
   693  func genPodLabelMetricKey(key string) string {
   694  	key = strings.ReplaceAll(key, "-", "_")
   695  	return strings.Join([]string{MetricsPodLabelPrefix, key}, "_")
   696  }