github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package realtime 18 19 import ( 20 "context" 21 "fmt" 22 "strconv" 23 "sync" 24 "time" 25 26 v1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/api/resource" 28 "k8s.io/apimachinery/pkg/util/wait" 29 "k8s.io/klog/v2" 30 31 apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" 32 "github.com/kubewharf/katalyst-core/pkg/config" 33 "github.com/kubewharf/katalyst-core/pkg/consts" 34 "github.com/kubewharf/katalyst-core/pkg/metaserver" 35 "github.com/kubewharf/katalyst-core/pkg/metrics" 36 "github.com/kubewharf/katalyst-core/pkg/util/general" 37 utilkubeconfig "github.com/kubewharf/katalyst-core/pkg/util/kubelet/config" 38 "github.com/kubewharf/katalyst-core/pkg/util/metric" 39 "github.com/kubewharf/katalyst-core/pkg/util/native" 40 ) 41 42 const ( 43 realtimeOvercommitAdvisorUpdateFail = "realtime_overcommit_advisor_update_fail" 44 realtimeOvercommitAdvisorSyncNodeFail = "realtime_overcommit_advisor_sync_node_fail" 45 ) 46 47 var ( 48 cpuMetricsToGather = []string{ 49 consts.MetricCPUUsageContainer, 50 consts.MetricLoad1MinContainer, 51 consts.MetricLoad5MinContainer, 52 } 53 54 memoryMetricsToGather = []string{ 55 consts.MetricMemRssContainer, 56 } 57 ) 58 59 // RealtimeOvercommitmentAdvisor calculate node CPU and memory overcommitment ratio 60 // by realtime metrics and node requested resources from metaSever 61 type RealtimeOvercommitmentAdvisor struct { 62 mutex sync.RWMutex 63 64 metaServer *metaserver.MetaServer 65 emitter metrics.MetricEmitter 66 67 updatePeriod time.Duration 68 syncPodTimeout time.Duration 69 70 nodeTargetCPULoad float64 71 nodeTargetMemoryLoad float64 72 podEstimatedCPULoad float64 73 podEstimatedMemoryLoad float64 74 75 cpuMetricsToGather []string 76 memoryMetricsToGather []string 77 78 resourceOvercommitRatio map[v1.ResourceName]float64 79 resourceAllocatable map[v1.ResourceName]resource.Quantity 80 } 81 82 type PodResourceInfo struct { 83 usage float64 84 request resource.Quantity 85 limit resource.Quantity 86 } 87 88 func NewRealtimeOvercommitmentAdvisor( 89 conf *config.Configuration, 90 metaServer *metaserver.MetaServer, 91 emitter metrics.MetricEmitter, 92 ) *RealtimeOvercommitmentAdvisor { 93 ra := &RealtimeOvercommitmentAdvisor{ 94 metaServer: metaServer, 95 emitter: emitter, 96 97 resourceOvercommitRatio: map[v1.ResourceName]float64{ 98 v1.ResourceCPU: 1.0, 99 v1.ResourceMemory: 1.0, 100 }, 101 resourceAllocatable: map[v1.ResourceName]resource.Quantity{}, 102 103 updatePeriod: conf.OvercommitAwarePluginConfiguration.SyncPeriod, 104 syncPodTimeout: conf.SyncPodTimeout, 105 nodeTargetCPULoad: conf.TargetCPULoad, 106 nodeTargetMemoryLoad: conf.TargetMemoryLoad, 107 podEstimatedCPULoad: conf.EstimatedPodCPULoad, 108 podEstimatedMemoryLoad: conf.EstimatedPodMemoryLoad, 109 cpuMetricsToGather: conf.CPUMetricsToGather, 110 memoryMetricsToGather: conf.MemoryMetricsToGather, 111 } 112 113 err := ra.syncAllocatableResource() 114 if err != nil { 115 klog.Fatalf("syncAllocatableResource fail: %v", err) 116 } 117 118 return ra 119 } 120 121 func (ra *RealtimeOvercommitmentAdvisor) Run(ctx context.Context) { 122 klog.Infof("RealtimeOvercommitmentAdvisor run...") 123 124 go wait.Until(func() { 125 err := ra.syncAllocatableResource() 126 if err != nil { 127 klog.Errorf("syncAllocatableResource fail: %v", err) 128 _ = ra.emitter.StoreInt64(realtimeOvercommitAdvisorSyncNodeFail, 1, metrics.MetricTypeNameCount) 129 } 130 }, time.Hour, ctx.Done()) 131 132 go wait.Until(func() { 133 err := ra.update() 134 if err != nil { 135 klog.Errorf("RealtimeOvercommitmentAdvisor update fail: %v", err) 136 _ = ra.emitter.StoreInt64(realtimeOvercommitAdvisorUpdateFail, 1, metrics.MetricTypeNameCount) 137 } 138 }, ra.updatePeriod, ctx.Done()) 139 } 140 141 func (ra *RealtimeOvercommitmentAdvisor) update() error { 142 // list pod from metaServer 143 ctx, cancel := context.WithTimeout(context.Background(), ra.syncPodTimeout) 144 defer cancel() 145 podList, err := ra.metaServer.GetPodList(ctx, nil) 146 if err != nil { 147 err = fmt.Errorf("[overcommitment-aware-realtime] list pod fail, err: %v", err) 148 klog.Error(err) 149 return err 150 } 151 152 // sum node request resource 153 nodeResourceRequest := sumUpPodsResources(podList) 154 klog.V(6).Infof("[overcommitment-aware-realtime] sumUpPodsResources, cpu: %v, memory: %v", nodeResourceRequest.Cpu().String(), nodeResourceRequest.Memory().String()) 155 156 // agg node pods usage 157 nodeResourceUsage := ra.aggregateNodeMetrics(podList) 158 klog.V(6).Infof("[overcommitment-aware-realtime] aggregateNodeMetrics: %v", nodeResourceUsage) 159 160 ra.metricsToOvercommitRatio(nodeResourceRequest, nodeResourceUsage) 161 162 return nil 163 } 164 165 func (ra *RealtimeOvercommitmentAdvisor) aggregateNodeMetrics(podList []*v1.Pod) map[v1.ResourceName]float64 { 166 var cpuUsage, memoryUsage float64 167 168 if len(ra.cpuMetricsToGather) != 0 { 169 cpuUsage = ra.aggregateMetrics(podList, ra.cpuMetricsToGather) 170 } else { 171 cpuUsage = ra.aggregateMetrics(podList, cpuMetricsToGather) 172 } 173 174 if len(ra.memoryMetricsToGather) != 0 { 175 memoryUsage = ra.aggregateMetrics(podList, ra.memoryMetricsToGather) 176 } else { 177 memoryUsage = ra.aggregateMetrics(podList, memoryMetricsToGather) 178 } 179 180 return map[v1.ResourceName]float64{ 181 v1.ResourceCPU: cpuUsage, 182 v1.ResourceMemory: memoryUsage, 183 } 184 } 185 186 func (ra *RealtimeOvercommitmentAdvisor) aggregateMetrics(podList []*v1.Pod, metrics []string) float64 { 187 var ( 188 res float64 189 metricValue float64 190 reference string 191 ) 192 193 for _, pod := range podList { 194 metricValue = 0 195 reference = "" 196 197 for _, metricName := range metrics { 198 metricData := ra.metaServer.AggregatePodMetric([]*v1.Pod{pod}, metricName, metric.AggregatorSum, metric.DefaultContainerMetricFilter) 199 if klog.V(5).Enabled() { 200 general.Infof("pod %v metric %v value %v", pod.Name, metricName, metricData.Value) 201 } 202 if metricData.Value <= 0 { 203 continue 204 } 205 206 if metricData.Value > metricValue { 207 metricValue = metricData.Value 208 reference = metricName 209 } 210 } 211 212 if klog.V(5).Enabled() { 213 general.Infof("pod %v aggregateCPU value %v reference %v", pod.Name, metricValue, reference) 214 } 215 res += metricValue 216 } 217 218 return res 219 } 220 221 func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableResource() error { 222 ctx, cancel := context.WithTimeout(context.Background(), time.Second) 223 defer cancel() 224 225 kconfig, err := ra.metaServer.GetKubeletConfig(ctx) 226 if err != nil { 227 klog.Errorf("get kubeletconfig fail: %v", err) 228 return err 229 } 230 231 reservedCPU, found, err := utilkubeconfig.GetReservedQuantity(kconfig, string(v1.ResourceCPU)) 232 if err != nil { 233 klog.Errorf("GetKubeletReservedQuantity fail: %v", err) 234 return err 235 } else if !found { 236 reservedCPU = *resource.NewQuantity(0, resource.DecimalSI) 237 } 238 239 reservedMemory, found, err := utilkubeconfig.GetReservedQuantity(kconfig, string(v1.ResourceMemory)) 240 if err != nil { 241 klog.Errorf("GetKubeletReservedQuantity fail: %v", err) 242 return err 243 } else if !found { 244 reservedMemory = *resource.NewQuantity(0, resource.BinarySI) 245 } 246 247 ra.syncAllocatableCPU(reservedCPU) 248 ra.syncAllocatableMemory(reservedMemory) 249 return nil 250 } 251 252 func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableCPU(reserved resource.Quantity) { 253 capacity := resource.NewMilliQuantity(int64(ra.metaServer.MachineInfo.NumCores*1000), resource.DecimalSI) 254 capacity.Sub(reserved) 255 256 ra.mutex.Lock() 257 ra.resourceAllocatable[v1.ResourceCPU] = *capacity 258 ra.mutex.Unlock() 259 260 klog.V(5).Infof("node allocatable cpu %v, reserved cpu %v", capacity.String(), reserved.String()) 261 } 262 263 func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableMemory(reserved resource.Quantity) { 264 capacity := resource.NewQuantity(int64(ra.metaServer.MemoryCapacity), resource.BinarySI) 265 266 capacity.Sub(reserved) 267 268 ra.mutex.Lock() 269 ra.resourceAllocatable[v1.ResourceMemory] = *capacity 270 ra.mutex.Unlock() 271 272 klog.V(5).Infof("node allocatable memory %v, reserved memory %v", capacity.String(), reserved.String()) 273 } 274 275 func (ra *RealtimeOvercommitmentAdvisor) metricsToOvercommitRatio(resourceRequest v1.ResourceList, resourceUsage map[v1.ResourceName]float64) { 276 cpuOvercommitRatio := ra.resourceMetricsToOvercommitRatio(v1.ResourceCPU, *resourceRequest.Cpu(), resourceUsage[v1.ResourceCPU]) 277 278 memoryOvercommitRatio := ra.resourceMetricsToOvercommitRatio(v1.ResourceMemory, *resourceRequest.Memory(), resourceUsage[v1.ResourceMemory]) 279 280 ra.mutex.Lock() 281 ra.resourceOvercommitRatio[v1.ResourceCPU] = cpuOvercommitRatio 282 ra.resourceOvercommitRatio[v1.ResourceMemory] = memoryOvercommitRatio 283 ra.mutex.Unlock() 284 } 285 286 func (ra *RealtimeOvercommitmentAdvisor) resourceMetricsToOvercommitRatio(resourceName v1.ResourceName, resourceRequest resource.Quantity, usage float64) float64 { 287 ra.mutex.RLock() 288 resourceAllocatable, ok := ra.resourceAllocatable[resourceName] 289 ra.mutex.RUnlock() 290 291 if !ok { 292 klog.Errorf("resource %v not exist in resourceAllocatable map", resourceName) 293 return 1.0 294 } 295 296 allocatable := resourceAllocatable.MilliValue() 297 request := resourceRequest.MilliValue() 298 usage = usage * 1000 299 300 if request == 0 || allocatable == 0 { 301 klog.Warningf("unexpected node resource, resourceName: %v, request: %v, allocatable: %v", resourceName, request, allocatable) 302 return 1.0 303 } 304 305 existedPodLoad := usage / float64(request) 306 if existedPodLoad > 1 { 307 existedPodLoad = 1 308 } 309 var podExpectedLoad, nodeTargetLoad float64 310 switch resourceName { 311 case v1.ResourceCPU: 312 podExpectedLoad = ra.podEstimatedCPULoad 313 nodeTargetLoad = ra.nodeTargetCPULoad 314 case v1.ResourceMemory: 315 podExpectedLoad = ra.podEstimatedMemoryLoad 316 nodeTargetLoad = ra.nodeTargetMemoryLoad 317 default: 318 klog.Warningf("unknow resourceName: %v", resourceName) 319 return 1.0 320 } 321 if existedPodLoad < podExpectedLoad { 322 existedPodLoad = podExpectedLoad 323 } 324 325 overcommitRatio := ((float64(allocatable)*nodeTargetLoad-usage)/existedPodLoad + float64(request)) / float64(allocatable) 326 327 klog.V(5).Infof("resource %v request: %v, allocatable: %v, usage: %v, targetLoad: %v, existLoad: %v, overcommitRatio: %v", 328 resourceName, request, allocatable, usage, nodeTargetLoad, existedPodLoad, overcommitRatio) 329 if overcommitRatio < 1.0 { 330 overcommitRatio = 1.0 331 } 332 return overcommitRatio 333 } 334 335 func sumUpPodsResources(podList []*v1.Pod) v1.ResourceList { 336 var ( 337 podsCPURequest = resource.NewQuantity(0, resource.DecimalSI) 338 podsMemoryRequest = resource.NewQuantity(0, resource.BinarySI) 339 ) 340 341 for _, pod := range podList { 342 podResource := native.SumUpPodRequestResources(pod) 343 344 cpuRequest := podResource.Cpu() 345 memoryRequest := podResource.Memory() 346 347 podsCPURequest.Add(*cpuRequest) 348 podsMemoryRequest.Add(*memoryRequest) 349 } 350 351 return v1.ResourceList{ 352 v1.ResourceCPU: podsCPURequest.DeepCopy(), 353 v1.ResourceMemory: podsMemoryRequest.DeepCopy(), 354 } 355 } 356 357 func (ra *RealtimeOvercommitmentAdvisor) GetOvercommitRatio() (map[v1.ResourceName]float64, error) { 358 res := map[v1.ResourceName]float64{ 359 v1.ResourceCPU: 1.0, 360 v1.ResourceMemory: 1.0, 361 } 362 363 ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) 364 defer cancel() 365 node, err := ra.metaServer.GetNode(ctx) 366 if err != nil { 367 klog.Error("GetOvercommitRatio getNode fail: %v", err) 368 return nil, err 369 } 370 if cpuOvercommitRatioAnno, ok := node.Annotations[apiconsts.NodeAnnotationCPUOvercommitRatioKey]; ok { 371 cpuOvercommitRatio, err := strconv.ParseFloat(cpuOvercommitRatioAnno, 64) 372 if err != nil { 373 klog.Errorf("%s parse fail: %v", cpuOvercommitRatioAnno, err) 374 } else { 375 res[v1.ResourceCPU] = cpuOvercommitRatio 376 } 377 } 378 if memOvercommitRatioAnno, ok := node.Annotations[apiconsts.NodeAnnotationMemoryOvercommitRatioKey]; ok { 379 memOvercommitRatio, err := strconv.ParseFloat(memOvercommitRatioAnno, 64) 380 if err != nil { 381 klog.Errorf("%s parse fail: %v", memOvercommitRatioAnno, err) 382 } else { 383 res[v1.ResourceMemory] = memOvercommitRatio 384 } 385 } 386 387 ra.mutex.RLock() 388 defer ra.mutex.RUnlock() 389 390 if len(ra.resourceOvercommitRatio) <= 0 { 391 return map[v1.ResourceName]float64{}, nil 392 } 393 394 // only report when overcommit ratio less than the set value 395 for resourceName, overcommitRatio := range ra.resourceOvercommitRatio { 396 if overcommitRatio >= res[resourceName] { 397 continue 398 } 399 res[resourceName] = overcommitRatio 400 } 401 402 return res, nil 403 }