github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction/strategy/pressure_suppression.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package strategy 18 19 import ( 20 "context" 21 "fmt" 22 "math" 23 "sync" 24 "time" 25 26 v1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/api/resource" 28 29 "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1" 30 pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1" 31 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state" 32 "github.com/kubewharf/katalyst-core/pkg/config" 33 "github.com/kubewharf/katalyst-core/pkg/metaserver" 34 "github.com/kubewharf/katalyst-core/pkg/metrics" 35 "github.com/kubewharf/katalyst-core/pkg/util/general" 36 "github.com/kubewharf/katalyst-core/pkg/util/native" 37 "github.com/kubewharf/katalyst-core/pkg/util/qos" 38 ) 39 40 const EvictionNameSuppression = "cpu-pressure-suppression-plugin" 41 42 type CPUPressureSuppression struct { 43 conf *config.Configuration 44 state state.ReadonlyState 45 46 lastToleranceTime sync.Map 47 } 48 49 func NewCPUPressureSuppressionEviction(_ metrics.MetricEmitter, _ *metaserver.MetaServer, 50 conf *config.Configuration, state state.ReadonlyState, 51 ) (CPUPressureEviction, error) { 52 return &CPUPressureSuppression{ 53 conf: conf, 54 state: state, 55 }, nil 56 } 57 58 func (p *CPUPressureSuppression) Start(context.Context) error { return nil } 59 func (p *CPUPressureSuppression) Name() string { return EvictionNameSuppression } 60 func (p *CPUPressureSuppression) ThresholdMet(_ context.Context, _ *pluginapi.Empty) (*pluginapi.ThresholdMetResponse, error) { 61 return &pluginapi.ThresholdMetResponse{}, nil 62 } 63 64 func (p *CPUPressureSuppression) GetTopEvictionPods(_ context.Context, _ *pluginapi.GetTopEvictionPodsRequest) (*pluginapi.GetTopEvictionPodsResponse, error) { 65 return &pluginapi.GetTopEvictionPodsResponse{}, nil 66 } 67 68 func (p *CPUPressureSuppression) GetEvictPods(_ context.Context, request *pluginapi.GetEvictPodsRequest) (*pluginapi.GetEvictPodsResponse, error) { 69 if request == nil { 70 return nil, fmt.Errorf("GetEvictPods got nil request") 71 } 72 73 dynamicConfig := p.conf.GetDynamicConfiguration() 74 if !dynamicConfig.EnableSuppressionEviction { 75 return &pluginapi.GetEvictPodsResponse{}, nil 76 } 77 general.InfoS("cpu suppression enabled") 78 79 // only reclaim pool support suppression tolerance eviction 80 entries := p.state.GetPodEntries() 81 poolCPUSet, err := entries.GetCPUSetForPool(state.PoolNameReclaim) 82 if err != nil { 83 return nil, fmt.Errorf("get reclaim pool failed: %s", err) 84 } 85 86 // skip evict pods if pool size is zero 87 poolSize := poolCPUSet.Size() 88 if poolSize == 0 { 89 general.Errorf("reclaim pool set size is empty") 90 return &pluginapi.GetEvictPodsResponse{}, nil 91 } 92 93 filteredPods := native.FilterPods(request.ActivePods, p.conf.CheckReclaimedQoSForPod) 94 if len(filteredPods) == 0 { 95 return &pluginapi.GetEvictPodsResponse{}, nil 96 } 97 98 // prioritize evicting the pod whose cpu request is larger and priority is lower 99 general.NewMultiSorter( 100 general.ReverseCmpFunc(native.PodCPURequestCmpFunc), 101 general.ReverseCmpFunc(native.PodPriorityCmpFunc), 102 native.PodUniqKeyCmpFunc, 103 ).Sort(native.NewPodSourceImpList(filteredPods)) 104 105 // sum all pod cpu request 106 totalCPURequest := resource.Quantity{} 107 for _, pod := range filteredPods { 108 totalCPURequest.Add(native.CPUQuantityGetter()(native.SumUpPodRequestResources(pod))) 109 } 110 general.Infof("total reclaim cpu request is %v, reclaim pool size is %v", totalCPURequest.String(), poolSize) 111 112 now := time.Now() 113 var evictPods []*v1alpha1.EvictPod 114 for _, pod := range filteredPods { 115 key := native.GenerateUniqObjectNameKey(pod) 116 poolSuppressionRate := float64(totalCPURequest.Value()) / float64(poolSize) 117 118 if podToleranceRate := p.getPodToleranceRate(pod, dynamicConfig.MaxSuppressionToleranceRate); podToleranceRate < poolSuppressionRate { 119 last, _ := p.lastToleranceTime.LoadOrStore(key, now) 120 lastDuration := now.Sub(last.(time.Time)) 121 general.Infof("current pool suppression rate %.2f, "+ 122 "and it is over than suppression tolerance rate %.2f of pod %s, last duration: %s secs", poolSuppressionRate, 123 podToleranceRate, key, now.Sub(last.(time.Time))) 124 125 // a pod will only be evicted if its cpu suppression lasts longer than minToleranceDuration 126 if lastDuration > dynamicConfig.MinSuppressionToleranceDuration { 127 evictPods = append(evictPods, &v1alpha1.EvictPod{ 128 Pod: pod, 129 Reason: fmt.Sprintf("current pool suppression rate %.2f is over than the "+ 130 "pod suppression tolerance rate %.2f", poolSuppressionRate, podToleranceRate), 131 }) 132 totalCPURequest.Sub(native.CPUQuantityGetter()(native.SumUpPodRequestResources(pod))) 133 } 134 } else { 135 p.lastToleranceTime.Delete(key) 136 } 137 } 138 139 // clear inactive filtered pod from lastToleranceTime 140 filteredPodsMap := native.GetPodNamespaceNameKeyMap(filteredPods) 141 p.lastToleranceTime.Range(func(key, _ interface{}) bool { 142 if _, ok := filteredPodsMap[key.(string)]; !ok { 143 p.lastToleranceTime.Delete(key) 144 } 145 return true 146 }) 147 148 return &pluginapi.GetEvictPodsResponse{EvictPods: evictPods}, nil 149 } 150 151 // getPodToleranceRate returns pod suppression tolerance rate, 152 // and it is limited by max cpu suppression tolerance rate. 153 func (p *CPUPressureSuppression) getPodToleranceRate(pod *v1.Pod, maxToleranceRate float64) float64 { 154 rate, err := qos.GetPodCPUSuppressionToleranceRate(p.conf.QoSConfiguration, pod) 155 if err != nil { 156 general.Errorf("pod %s get cpu suppression tolerance rate failed: %s", 157 native.GenerateUniqObjectNameKey(pod), err) 158 return maxToleranceRate 159 } else { 160 return math.Min(rate, maxToleranceRate) 161 } 162 }