github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/evictionmanager/plugin/memory/rss_overuse.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package memory
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strconv"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/labels"
    27  	"k8s.io/apimachinery/pkg/util/sets"
    28  	"k8s.io/client-go/tools/events"
    29  
    30  	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
    31  	pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1"
    32  	"github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin"
    33  	"github.com/kubewharf/katalyst-core/pkg/client"
    34  	"github.com/kubewharf/katalyst-core/pkg/config"
    35  	"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
    36  	"github.com/kubewharf/katalyst-core/pkg/config/agent/eviction"
    37  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    38  	"github.com/kubewharf/katalyst-core/pkg/consts"
    39  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    40  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
    41  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    42  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    43  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    44  	"github.com/kubewharf/katalyst-core/pkg/util/qos"
    45  )
    46  
    47  const (
    48  	EvictionPluginNameRssOveruse = "rss-overuse-eviction-plugin"
    49  
    50  	RssOveruseEvictionReason = "hit rss overuse policy, threshold is %.2f, current pod rss is %.2f, pod memory request is %d"
    51  )
    52  
    53  func NewRssOveruseEvictionPlugin(_ *client.GenericClientSet, _ events.EventRecorder,
    54  	metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, conf *config.Configuration,
    55  ) plugin.EvictionPlugin {
    56  	return &RssOveruseEvictionPlugin{
    57  		StopControl:        process.NewStopControl(time.Time{}),
    58  		emitter:            emitter,
    59  		reclaimedPodFilter: conf.CheckReclaimedQoSForPod,
    60  		pluginName:         EvictionPluginNameRssOveruse,
    61  		metaServer:         metaServer,
    62  		supportedQosLevels: sets.NewString(apiconsts.PodAnnotationQoSLevelReclaimedCores, apiconsts.PodAnnotationQoSLevelSharedCores),
    63  
    64  		dynamicConfig:  conf.DynamicAgentConfiguration,
    65  		qosConf:        conf.QoSConfiguration,
    66  		evictionConfig: conf.MemoryPressureEvictionConfiguration,
    67  	}
    68  }
    69  
    70  // RssOveruseEvictionPlugin implements the EvictPlugin interface. It triggers pod eviction based on the rss usage ratio.
    71  // Once a pod use more rss than the specified threshold, this plugin will evict the pod.The threshold is calculated based
    72  // on pod's memory request. Its main goal is to make sure sufficient memory for page cache in some scenarios in which
    73  // service use page cache to improve performance.
    74  type RssOveruseEvictionPlugin struct {
    75  	*process.StopControl
    76  
    77  	emitter            metrics.MetricEmitter
    78  	reclaimedPodFilter func(pod *v1.Pod) (bool, error)
    79  	pluginName         string
    80  	metaServer         *metaserver.MetaServer
    81  	supportedQosLevels sets.String
    82  
    83  	dynamicConfig  *dynamic.DynamicAgentConfiguration
    84  	qosConf        *generic.QoSConfiguration
    85  	evictionConfig *eviction.MemoryPressureEvictionConfiguration
    86  }
    87  
    88  func (r *RssOveruseEvictionPlugin) Name() string {
    89  	if r == nil {
    90  		return ""
    91  	}
    92  
    93  	return r.pluginName
    94  }
    95  
    96  func (r *RssOveruseEvictionPlugin) ThresholdMet(_ context.Context) (*pluginapi.ThresholdMetResponse, error) {
    97  	return &pluginapi.ThresholdMetResponse{
    98  		MetType: pluginapi.ThresholdMetType_NOT_MET,
    99  	}, nil
   100  }
   101  
   102  func (r *RssOveruseEvictionPlugin) GetTopEvictionPods(_ context.Context, _ *pluginapi.GetTopEvictionPodsRequest) (*pluginapi.GetTopEvictionPodsResponse, error) {
   103  	return &pluginapi.GetTopEvictionPodsResponse{}, nil
   104  }
   105  
   106  func (r *RssOveruseEvictionPlugin) GetEvictPods(_ context.Context, request *pluginapi.GetEvictPodsRequest) (*pluginapi.GetEvictPodsResponse, error) {
   107  	result := make([]*pluginapi.EvictPod, 0)
   108  
   109  	dynamicConfig := r.dynamicConfig.GetDynamicConfiguration()
   110  	if !dynamicConfig.EnableRSSOveruseEviction {
   111  		return &pluginapi.GetEvictPodsResponse{EvictPods: result}, nil
   112  	}
   113  
   114  	filterPods := make([]*v1.Pod, 0, len(request.ActivePods))
   115  	selector := r.evictionConfig.RSSOveruseEvictionFilter.AsSelector()
   116  
   117  	for i := range request.ActivePods {
   118  		pod := request.ActivePods[i]
   119  		set := (labels.Set)(pod.Labels)
   120  		if selector.Matches(set) {
   121  			filterPods = append(filterPods, pod)
   122  		}
   123  	}
   124  
   125  	for i := range filterPods {
   126  		pod := filterPods[i]
   127  
   128  		qosLevel, err := r.qosConf.GetQoSLevelForPod(pod)
   129  		if err != nil {
   130  			general.Errorf("get qos level failed for pod %+v/%+v, skip check rss overuse, err: %v", pod.Namespace, pod.Name, err)
   131  			continue
   132  		}
   133  
   134  		if !r.supportedQosLevels.Has(qosLevel) {
   135  			continue
   136  		}
   137  
   138  		userSpecifiedThreshold, invalid := qos.GetRSSOverUseEvictThreshold(r.qosConf, pod)
   139  		// don't perform eviction for safety if user set an invalid threshold
   140  		if invalid {
   141  			general.Warningf("pod %+v/%+v set invalid overuse eviction threshold, skip check rss overuse", pod.Namespace, pod.Name)
   142  			continue
   143  		}
   144  
   145  		threshold := dynamicConfig.RSSOveruseRateThreshold
   146  		// user set threshold explicitly,use default value
   147  		if userSpecifiedThreshold != nil {
   148  			threshold = *userSpecifiedThreshold
   149  		}
   150  
   151  		var memRequest int64 = 0
   152  		requestNotSet := false
   153  		for _, container := range pod.Spec.Containers {
   154  			containerMemRequest := container.Resources.Requests.Memory()
   155  			if containerMemRequest.IsZero() {
   156  				requestNotSet = true
   157  				continue
   158  			}
   159  			memRequest += containerMemRequest.Value()
   160  		}
   161  
   162  		// if there is at least one container without memory limit, skip it
   163  		if requestNotSet {
   164  			continue
   165  		}
   166  
   167  		podRss, err := helper.GetPodMetric(r.metaServer.MetricsFetcher, r.emitter, pod, consts.MetricMemRssContainer, nonExistNumaID)
   168  		if err != nil {
   169  			_ = r.emitter.StoreInt64(metricsNameFetchMetricError, 1, metrics.MetricTypeNameCount,
   170  				metrics.ConvertMapToTags(map[string]string{
   171  					metricsTagKeyNumaID: strconv.Itoa(nonExistNumaID),
   172  				})...)
   173  			continue
   174  		}
   175  
   176  		if podRss > threshold*float64(memRequest) {
   177  			result = append(result, &pluginapi.EvictPod{
   178  				Pod:        pod,
   179  				Reason:     fmt.Sprintf(RssOveruseEvictionReason, threshold, podRss, memRequest),
   180  				ForceEvict: false,
   181  			})
   182  		}
   183  	}
   184  
   185  	return &pluginapi.GetEvictPodsResponse{EvictPods: result}, nil
   186  }
   187  
   188  func (r *RssOveruseEvictionPlugin) Start() {
   189  	return
   190  }