github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/provisionpolicy/policy_rama.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package provisionpolicy 18 19 import ( 20 "fmt" 21 "math" 22 23 "k8s.io/apimachinery/pkg/util/errors" 24 "k8s.io/klog/v2" 25 26 apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" 27 "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" 28 "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" 29 "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" 30 "github.com/kubewharf/katalyst-core/pkg/config" 31 "github.com/kubewharf/katalyst-core/pkg/consts" 32 "github.com/kubewharf/katalyst-core/pkg/metaserver" 33 "github.com/kubewharf/katalyst-core/pkg/metrics" 34 "github.com/kubewharf/katalyst-core/pkg/util/machine" 35 ) 36 37 const ( 38 metricRamaDominantIndicator = "rama_dominant_indicator" 39 ) 40 41 type PolicyRama struct { 42 *PolicyBase 43 conf *config.Configuration 44 controllers map[string]*helper.PIDController // map[metricName]controller 45 } 46 47 func NewPolicyRama(regionName string, regionType types.QoSRegionType, ownerPoolName string, 48 conf *config.Configuration, _ interface{}, metaReader metacache.MetaReader, 49 metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, 50 ) ProvisionPolicy { 51 p := &PolicyRama{ 52 conf: conf, 53 PolicyBase: NewPolicyBase(regionName, regionType, ownerPoolName, metaReader, metaServer, emitter), 54 controllers: make(map[string]*helper.PIDController), 55 } 56 57 return p 58 } 59 60 func (p *PolicyRama) Update() error { 61 // sanity check 62 if err := p.sanityCheck(); err != nil { 63 return err 64 } 65 66 cpuSize := p.ControlKnobs[types.ControlKnobNonReclaimedCPUSize].Value 67 68 cpuAdjustedRaw := math.Inf(-1) 69 dominantIndicator := "unknown" 70 71 // run pid control for each indicator 72 for metricName, indicator := range p.Indicators { 73 params, ok := p.conf.PolicyRama.PIDParameters[metricName] 74 if !ok { 75 klog.Warningf("[qosaware-cpu-rama] pid parameter not found for indicator %v", metricName) 76 continue 77 } 78 79 controller, ok := p.controllers[metricName] 80 if !ok { 81 controller = helper.NewPIDController(metricName, params) 82 p.controllers[metricName] = controller 83 } 84 85 controller.SetEssentials(p.ResourceEssentials) 86 cpuAdjusted := controller.Adjust(cpuSize, indicator.Target, indicator.Current) 87 88 if cpuAdjusted > cpuAdjustedRaw { 89 cpuAdjustedRaw = cpuAdjusted 90 dominantIndicator = metricName 91 } 92 } 93 94 period := p.conf.QoSAwarePluginConfiguration.SyncPeriod 95 _ = p.emitter.StoreInt64(metricRamaDominantIndicator, int64(period.Seconds()), metrics.MetricTypeNameCount, []metrics.MetricTag{ 96 {Key: "metric_name", Val: dominantIndicator}, 97 }...) 98 99 for metricName := range p.controllers { 100 _, ok := p.conf.PolicyRama.PIDParameters[metricName] 101 if !ok { 102 delete(p.controllers, metricName) 103 } 104 } 105 106 cpuAdjustedRestricted := cpuAdjustedRaw 107 108 // restrict cpu size adjusted 109 if p.ControlEssentials.ReclaimOverlap { 110 reclaimedUsage, reclaimedCnt := p.getReclaimStatus() 111 klog.Infof("[qosaware-cpu-rama] reclaim usage %.2f #container %v", reclaimedUsage, reclaimedCnt) 112 113 reason := "" 114 if reclaimedCnt <= 0 { 115 // do not reclaim if no reclaimed containers 116 cpuAdjustedRestricted = p.ResourceUpperBound 117 reason = "no reclaimed container" 118 } else { 119 // do not overlap more if reclaim usage is below threshold 120 threshold := p.ResourceUpperBound - reclaimedUsage - types.ReclaimUsageMarginForOverlap 121 cpuAdjustedRestricted = math.Max(cpuAdjustedRestricted, threshold) 122 reason = "low reclaim usage" 123 } 124 if cpuAdjustedRestricted != cpuAdjustedRaw { 125 klog.Infof("[qosaware-cpu-rama] restrict cpu adjusted from %.2f to %.2f, reason: %v", cpuAdjustedRaw, cpuAdjustedRestricted, reason) 126 } 127 } 128 129 p.controlKnobAdjusted = types.ControlKnob{ 130 types.ControlKnobNonReclaimedCPUSize: types.ControlKnobValue{ 131 Value: cpuAdjustedRestricted, 132 Action: types.ControlKnobActionNone, 133 }, 134 } 135 136 return nil 137 } 138 139 func (p *PolicyRama) sanityCheck() error { 140 var ( 141 isLegal bool 142 errList []error 143 ) 144 145 enableReclaim := p.conf.GetDynamicConfiguration().EnableReclaim 146 147 // 1. check if enable reclaim 148 if !enableReclaim { 149 errList = append(errList, fmt.Errorf("reclaim disabled")) 150 } 151 152 // 2. check margin. skip update when margin is non zero 153 if p.ResourceEssentials.ReservedForAllocate != 0 { 154 errList = append(errList, fmt.Errorf("margin exists")) 155 } 156 157 // 3. check control knob legality 158 isLegal = true 159 if p.ControlKnobs == nil || len(p.ControlKnobs) <= 0 { 160 isLegal = false 161 } else { 162 v, ok := p.ControlKnobs[types.ControlKnobNonReclaimedCPUSize] 163 if !ok || v.Value <= 0 { 164 isLegal = false 165 } 166 } 167 if !isLegal { 168 errList = append(errList, fmt.Errorf("illegal control knob %v", p.ControlKnobs)) 169 } 170 171 // 4. check indicators legality 172 if p.Indicators == nil { 173 errList = append(errList, fmt.Errorf("illegal indicators")) 174 } 175 176 return errors.NewAggregate(errList) 177 } 178 179 func (p *PolicyRama) getReclaimStatus() (usage float64, cnt int) { 180 usage = 0 181 cnt = 0 182 183 f := func(podUID string, containerName string, ci *types.ContainerInfo) bool { 184 if ci.QoSLevel != apiconsts.PodAnnotationQoSLevelReclaimedCores { 185 return true 186 } 187 188 containerUsage := ci.CPURequest 189 m, err := p.metaServer.GetContainerMetric(podUID, containerName, consts.MetricCPUUsageContainer) 190 if err == nil { 191 containerUsage = m.Value 192 } 193 194 // FIXME: metric server doesn't support to report cpu usage in numa granularity, 195 // so we split cpu usage evenly across the binding numas of container. 196 if p.bindingNumas.Size() > 0 { 197 cpuSize := 0 198 for _, numaID := range p.bindingNumas.ToSliceInt() { 199 cpuSize += ci.TopologyAwareAssignments[numaID].Size() 200 } 201 containerUsageNuma := 0.0 202 cpuAssignmentCPUs := machine.CountCPUAssignmentCPUs(ci.TopologyAwareAssignments) 203 if cpuAssignmentCPUs != 0 { 204 containerUsageNuma = containerUsage * float64(cpuSize) / float64(cpuAssignmentCPUs) 205 } else { 206 // handle the case that cpuAssignmentCPUs is 0 207 klog.Warningf("[qosaware-cpu-rama] cpuAssignmentCPUs is 0 for %v/%v", podUID, containerName) 208 containerUsageNuma = 0 209 } 210 usage += containerUsageNuma 211 } 212 213 cnt += 1 214 return true 215 } 216 p.metaReader.RangeContainer(f) 217 218 return usage, cnt 219 }