github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/sysadvisor/plugin/overcommitmentaware/realtime/realtime.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package realtime
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strconv"
    23  	"sync"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/api/resource"
    28  	"k8s.io/apimachinery/pkg/util/wait"
    29  	"k8s.io/klog/v2"
    30  
    31  	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
    32  	"github.com/kubewharf/katalyst-core/pkg/config"
    33  	"github.com/kubewharf/katalyst-core/pkg/consts"
    34  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    35  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    36  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    37  	utilkubeconfig "github.com/kubewharf/katalyst-core/pkg/util/kubelet/config"
    38  	"github.com/kubewharf/katalyst-core/pkg/util/metric"
    39  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    40  )
    41  
    42  const (
    43  	realtimeOvercommitAdvisorUpdateFail   = "realtime_overcommit_advisor_update_fail"
    44  	realtimeOvercommitAdvisorSyncNodeFail = "realtime_overcommit_advisor_sync_node_fail"
    45  )
    46  
    47  var (
    48  	cpuMetricsToGather = []string{
    49  		consts.MetricCPUUsageContainer,
    50  		consts.MetricLoad1MinContainer,
    51  		consts.MetricLoad5MinContainer,
    52  	}
    53  
    54  	memoryMetricsToGather = []string{
    55  		consts.MetricMemRssContainer,
    56  	}
    57  )
    58  
    59  // RealtimeOvercommitmentAdvisor calculate node CPU and memory overcommitment ratio
    60  // by realtime metrics and node requested resources from metaSever
    61  type RealtimeOvercommitmentAdvisor struct {
    62  	mutex sync.RWMutex
    63  
    64  	metaServer *metaserver.MetaServer
    65  	emitter    metrics.MetricEmitter
    66  
    67  	updatePeriod   time.Duration
    68  	syncPodTimeout time.Duration
    69  
    70  	nodeTargetCPULoad      float64
    71  	nodeTargetMemoryLoad   float64
    72  	podEstimatedCPULoad    float64
    73  	podEstimatedMemoryLoad float64
    74  
    75  	cpuMetricsToGather    []string
    76  	memoryMetricsToGather []string
    77  
    78  	resourceOvercommitRatio map[v1.ResourceName]float64
    79  	resourceAllocatable     map[v1.ResourceName]resource.Quantity
    80  }
    81  
    82  type PodResourceInfo struct {
    83  	usage   float64
    84  	request resource.Quantity
    85  	limit   resource.Quantity
    86  }
    87  
    88  func NewRealtimeOvercommitmentAdvisor(
    89  	conf *config.Configuration,
    90  	metaServer *metaserver.MetaServer,
    91  	emitter metrics.MetricEmitter,
    92  ) *RealtimeOvercommitmentAdvisor {
    93  	ra := &RealtimeOvercommitmentAdvisor{
    94  		metaServer: metaServer,
    95  		emitter:    emitter,
    96  
    97  		resourceOvercommitRatio: map[v1.ResourceName]float64{
    98  			v1.ResourceCPU:    1.0,
    99  			v1.ResourceMemory: 1.0,
   100  		},
   101  		resourceAllocatable: map[v1.ResourceName]resource.Quantity{},
   102  
   103  		updatePeriod:           conf.OvercommitAwarePluginConfiguration.SyncPeriod,
   104  		syncPodTimeout:         conf.SyncPodTimeout,
   105  		nodeTargetCPULoad:      conf.TargetCPULoad,
   106  		nodeTargetMemoryLoad:   conf.TargetMemoryLoad,
   107  		podEstimatedCPULoad:    conf.EstimatedPodCPULoad,
   108  		podEstimatedMemoryLoad: conf.EstimatedPodMemoryLoad,
   109  		cpuMetricsToGather:     conf.CPUMetricsToGather,
   110  		memoryMetricsToGather:  conf.MemoryMetricsToGather,
   111  	}
   112  
   113  	err := ra.syncAllocatableResource()
   114  	if err != nil {
   115  		klog.Fatalf("syncAllocatableResource fail: %v", err)
   116  	}
   117  
   118  	return ra
   119  }
   120  
   121  func (ra *RealtimeOvercommitmentAdvisor) Run(ctx context.Context) {
   122  	klog.Infof("RealtimeOvercommitmentAdvisor run...")
   123  
   124  	go wait.Until(func() {
   125  		err := ra.syncAllocatableResource()
   126  		if err != nil {
   127  			klog.Errorf("syncAllocatableResource fail: %v", err)
   128  			_ = ra.emitter.StoreInt64(realtimeOvercommitAdvisorSyncNodeFail, 1, metrics.MetricTypeNameCount)
   129  		}
   130  	}, time.Hour, ctx.Done())
   131  
   132  	go wait.Until(func() {
   133  		err := ra.update()
   134  		if err != nil {
   135  			klog.Errorf("RealtimeOvercommitmentAdvisor update fail: %v", err)
   136  			_ = ra.emitter.StoreInt64(realtimeOvercommitAdvisorUpdateFail, 1, metrics.MetricTypeNameCount)
   137  		}
   138  	}, ra.updatePeriod, ctx.Done())
   139  }
   140  
   141  func (ra *RealtimeOvercommitmentAdvisor) update() error {
   142  	// list pod from metaServer
   143  	ctx, cancel := context.WithTimeout(context.Background(), ra.syncPodTimeout)
   144  	defer cancel()
   145  	podList, err := ra.metaServer.GetPodList(ctx, nil)
   146  	if err != nil {
   147  		err = fmt.Errorf("[overcommitment-aware-realtime] list pod fail, err: %v", err)
   148  		klog.Error(err)
   149  		return err
   150  	}
   151  
   152  	// sum node request resource
   153  	nodeResourceRequest := sumUpPodsResources(podList)
   154  	klog.V(6).Infof("[overcommitment-aware-realtime] sumUpPodsResources, cpu: %v, memory: %v", nodeResourceRequest.Cpu().String(), nodeResourceRequest.Memory().String())
   155  
   156  	// agg node pods usage
   157  	nodeResourceUsage := ra.aggregateNodeMetrics(podList)
   158  	klog.V(6).Infof("[overcommitment-aware-realtime] aggregateNodeMetrics: %v", nodeResourceUsage)
   159  
   160  	ra.metricsToOvercommitRatio(nodeResourceRequest, nodeResourceUsage)
   161  
   162  	return nil
   163  }
   164  
   165  func (ra *RealtimeOvercommitmentAdvisor) aggregateNodeMetrics(podList []*v1.Pod) map[v1.ResourceName]float64 {
   166  	var cpuUsage, memoryUsage float64
   167  
   168  	if len(ra.cpuMetricsToGather) != 0 {
   169  		cpuUsage = ra.aggregateMetrics(podList, ra.cpuMetricsToGather)
   170  	} else {
   171  		cpuUsage = ra.aggregateMetrics(podList, cpuMetricsToGather)
   172  	}
   173  
   174  	if len(ra.memoryMetricsToGather) != 0 {
   175  		memoryUsage = ra.aggregateMetrics(podList, ra.memoryMetricsToGather)
   176  	} else {
   177  		memoryUsage = ra.aggregateMetrics(podList, memoryMetricsToGather)
   178  	}
   179  
   180  	return map[v1.ResourceName]float64{
   181  		v1.ResourceCPU:    cpuUsage,
   182  		v1.ResourceMemory: memoryUsage,
   183  	}
   184  }
   185  
   186  func (ra *RealtimeOvercommitmentAdvisor) aggregateMetrics(podList []*v1.Pod, metrics []string) float64 {
   187  	var (
   188  		res         float64
   189  		metricValue float64
   190  		reference   string
   191  	)
   192  
   193  	for _, pod := range podList {
   194  		metricValue = 0
   195  		reference = ""
   196  
   197  		for _, metricName := range metrics {
   198  			metricData := ra.metaServer.AggregatePodMetric([]*v1.Pod{pod}, metricName, metric.AggregatorSum, metric.DefaultContainerMetricFilter)
   199  			if klog.V(5).Enabled() {
   200  				general.Infof("pod %v metric %v value %v", pod.Name, metricName, metricData.Value)
   201  			}
   202  			if metricData.Value <= 0 {
   203  				continue
   204  			}
   205  
   206  			if metricData.Value > metricValue {
   207  				metricValue = metricData.Value
   208  				reference = metricName
   209  			}
   210  		}
   211  
   212  		if klog.V(5).Enabled() {
   213  			general.Infof("pod %v aggregateCPU value %v reference %v", pod.Name, metricValue, reference)
   214  		}
   215  		res += metricValue
   216  	}
   217  
   218  	return res
   219  }
   220  
   221  func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableResource() error {
   222  	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   223  	defer cancel()
   224  
   225  	kconfig, err := ra.metaServer.GetKubeletConfig(ctx)
   226  	if err != nil {
   227  		klog.Errorf("get kubeletconfig fail: %v", err)
   228  		return err
   229  	}
   230  
   231  	reservedCPU, found, err := utilkubeconfig.GetReservedQuantity(kconfig, string(v1.ResourceCPU))
   232  	if err != nil {
   233  		klog.Errorf("GetKubeletReservedQuantity fail: %v", err)
   234  		return err
   235  	} else if !found {
   236  		reservedCPU = *resource.NewQuantity(0, resource.DecimalSI)
   237  	}
   238  
   239  	reservedMemory, found, err := utilkubeconfig.GetReservedQuantity(kconfig, string(v1.ResourceMemory))
   240  	if err != nil {
   241  		klog.Errorf("GetKubeletReservedQuantity fail: %v", err)
   242  		return err
   243  	} else if !found {
   244  		reservedMemory = *resource.NewQuantity(0, resource.BinarySI)
   245  	}
   246  
   247  	ra.syncAllocatableCPU(reservedCPU)
   248  	ra.syncAllocatableMemory(reservedMemory)
   249  	return nil
   250  }
   251  
   252  func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableCPU(reserved resource.Quantity) {
   253  	capacity := resource.NewMilliQuantity(int64(ra.metaServer.MachineInfo.NumCores*1000), resource.DecimalSI)
   254  	capacity.Sub(reserved)
   255  
   256  	ra.mutex.Lock()
   257  	ra.resourceAllocatable[v1.ResourceCPU] = *capacity
   258  	ra.mutex.Unlock()
   259  
   260  	klog.V(5).Infof("node allocatable cpu %v, reserved cpu %v", capacity.String(), reserved.String())
   261  }
   262  
   263  func (ra *RealtimeOvercommitmentAdvisor) syncAllocatableMemory(reserved resource.Quantity) {
   264  	capacity := resource.NewQuantity(int64(ra.metaServer.MemoryCapacity), resource.BinarySI)
   265  
   266  	capacity.Sub(reserved)
   267  
   268  	ra.mutex.Lock()
   269  	ra.resourceAllocatable[v1.ResourceMemory] = *capacity
   270  	ra.mutex.Unlock()
   271  
   272  	klog.V(5).Infof("node allocatable memory %v, reserved memory %v", capacity.String(), reserved.String())
   273  }
   274  
   275  func (ra *RealtimeOvercommitmentAdvisor) metricsToOvercommitRatio(resourceRequest v1.ResourceList, resourceUsage map[v1.ResourceName]float64) {
   276  	cpuOvercommitRatio := ra.resourceMetricsToOvercommitRatio(v1.ResourceCPU, *resourceRequest.Cpu(), resourceUsage[v1.ResourceCPU])
   277  
   278  	memoryOvercommitRatio := ra.resourceMetricsToOvercommitRatio(v1.ResourceMemory, *resourceRequest.Memory(), resourceUsage[v1.ResourceMemory])
   279  
   280  	ra.mutex.Lock()
   281  	ra.resourceOvercommitRatio[v1.ResourceCPU] = cpuOvercommitRatio
   282  	ra.resourceOvercommitRatio[v1.ResourceMemory] = memoryOvercommitRatio
   283  	ra.mutex.Unlock()
   284  }
   285  
   286  func (ra *RealtimeOvercommitmentAdvisor) resourceMetricsToOvercommitRatio(resourceName v1.ResourceName, resourceRequest resource.Quantity, usage float64) float64 {
   287  	ra.mutex.RLock()
   288  	resourceAllocatable, ok := ra.resourceAllocatable[resourceName]
   289  	ra.mutex.RUnlock()
   290  
   291  	if !ok {
   292  		klog.Errorf("resource %v not exist in resourceAllocatable map", resourceName)
   293  		return 1.0
   294  	}
   295  
   296  	allocatable := resourceAllocatable.MilliValue()
   297  	request := resourceRequest.MilliValue()
   298  	usage = usage * 1000
   299  
   300  	if request == 0 || allocatable == 0 {
   301  		klog.Warningf("unexpected node resource, resourceName: %v, request: %v, allocatable: %v", resourceName, request, allocatable)
   302  		return 1.0
   303  	}
   304  
   305  	existedPodLoad := usage / float64(request)
   306  	if existedPodLoad > 1 {
   307  		existedPodLoad = 1
   308  	}
   309  	var podExpectedLoad, nodeTargetLoad float64
   310  	switch resourceName {
   311  	case v1.ResourceCPU:
   312  		podExpectedLoad = ra.podEstimatedCPULoad
   313  		nodeTargetLoad = ra.nodeTargetCPULoad
   314  	case v1.ResourceMemory:
   315  		podExpectedLoad = ra.podEstimatedMemoryLoad
   316  		nodeTargetLoad = ra.nodeTargetMemoryLoad
   317  	default:
   318  		klog.Warningf("unknow resourceName: %v", resourceName)
   319  		return 1.0
   320  	}
   321  	if existedPodLoad < podExpectedLoad {
   322  		existedPodLoad = podExpectedLoad
   323  	}
   324  
   325  	overcommitRatio := ((float64(allocatable)*nodeTargetLoad-usage)/existedPodLoad + float64(request)) / float64(allocatable)
   326  
   327  	klog.V(5).Infof("resource %v request: %v, allocatable: %v, usage: %v, targetLoad: %v, existLoad: %v, overcommitRatio: %v",
   328  		resourceName, request, allocatable, usage, nodeTargetLoad, existedPodLoad, overcommitRatio)
   329  	if overcommitRatio < 1.0 {
   330  		overcommitRatio = 1.0
   331  	}
   332  	return overcommitRatio
   333  }
   334  
   335  func sumUpPodsResources(podList []*v1.Pod) v1.ResourceList {
   336  	var (
   337  		podsCPURequest    = resource.NewQuantity(0, resource.DecimalSI)
   338  		podsMemoryRequest = resource.NewQuantity(0, resource.BinarySI)
   339  	)
   340  
   341  	for _, pod := range podList {
   342  		podResource := native.SumUpPodRequestResources(pod)
   343  
   344  		cpuRequest := podResource.Cpu()
   345  		memoryRequest := podResource.Memory()
   346  
   347  		podsCPURequest.Add(*cpuRequest)
   348  		podsMemoryRequest.Add(*memoryRequest)
   349  	}
   350  
   351  	return v1.ResourceList{
   352  		v1.ResourceCPU:    podsCPURequest.DeepCopy(),
   353  		v1.ResourceMemory: podsMemoryRequest.DeepCopy(),
   354  	}
   355  }
   356  
   357  func (ra *RealtimeOvercommitmentAdvisor) GetOvercommitRatio() (map[v1.ResourceName]float64, error) {
   358  	res := map[v1.ResourceName]float64{
   359  		v1.ResourceCPU:    1.0,
   360  		v1.ResourceMemory: 1.0,
   361  	}
   362  
   363  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*2)
   364  	defer cancel()
   365  	node, err := ra.metaServer.GetNode(ctx)
   366  	if err != nil {
   367  		klog.Error("GetOvercommitRatio getNode fail: %v", err)
   368  		return nil, err
   369  	}
   370  	if cpuOvercommitRatioAnno, ok := node.Annotations[apiconsts.NodeAnnotationCPUOvercommitRatioKey]; ok {
   371  		cpuOvercommitRatio, err := strconv.ParseFloat(cpuOvercommitRatioAnno, 64)
   372  		if err != nil {
   373  			klog.Errorf("%s parse fail: %v", cpuOvercommitRatioAnno, err)
   374  		} else {
   375  			res[v1.ResourceCPU] = cpuOvercommitRatio
   376  		}
   377  	}
   378  	if memOvercommitRatioAnno, ok := node.Annotations[apiconsts.NodeAnnotationMemoryOvercommitRatioKey]; ok {
   379  		memOvercommitRatio, err := strconv.ParseFloat(memOvercommitRatioAnno, 64)
   380  		if err != nil {
   381  			klog.Errorf("%s parse fail: %v", memOvercommitRatioAnno, err)
   382  		} else {
   383  			res[v1.ResourceMemory] = memOvercommitRatio
   384  		}
   385  	}
   386  
   387  	ra.mutex.RLock()
   388  	defer ra.mutex.RUnlock()
   389  
   390  	if len(ra.resourceOvercommitRatio) <= 0 {
   391  		return map[v1.ResourceName]float64{}, nil
   392  	}
   393  
   394  	// only report when overcommit ratio less than the set value
   395  	for resourceName, overcommitRatio := range ra.resourceOvercommitRatio {
   396  		if overcommitRatio >= res[resourceName] {
   397  			continue
   398  		}
   399  		res[resourceName] = overcommitRatio
   400  	}
   401  
   402  	return res, nil
   403  }