github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/monitor/cnr_indicator.go

/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package monitor

import (
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"

	"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
	"github.com/kubewharf/katalyst-core/pkg/config/generic"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
	"github.com/kubewharf/katalyst-core/pkg/util/native"
	"github.com/kubewharf/katalyst-core/pkg/util/qos"
)

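// metricsNameCNRReportAnomaly is emitted (value 1) whenever a CNR report looks
// inconsistent, tagged with the anomaly reason; metricsNameCNRReportLantency
// records how long it takes for a scheduled pod's allocation to appear in the
// CNR, in milliseconds.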
const (
	metricsNameCNRReportAnomaly  = "cnr_report_anomaly"
	metricsNameCNRReportLantency = "cnr_report_latency"
)

const (
	// reasonNumaExclusiveAnomaly is reported when a pod that sets both
	// numa_binding and numa_exclusive shares its NUMA node with other pods
	reasonNumaExclusiveAnomaly = "NumaExclusiveAnomaly"
	// reasonNumaAllocatableSumAnomaly is reported when the sum of the NUMA zones'
	// allocatable resources does not match the node-level allocatable
	reasonNumaAllocatableSumAnomaly = "AllocatableSumAnomaly"
	// reasonPodAllocationSumAnomaly is reported when the sum of pod allocations
	// on a NUMA zone exceeds that zone's allocatable resources
	reasonPodAllocationSumAnomaly = "PodAllocationSumAnomaly"
)

// checkNumaExclusiveAnomaly checks whether any pod that sets both numa_binding
// and numa_exclusive shares its NUMA node with other numa_binding pods
func (ctrl *CNRMonitorController) checkNumaExclusiveAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
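	// the QoS configuration is used to tell, from pod annotations, whether a pod
	// requests the numa_binding / numa_exclusive enhancements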
	qosConf := generic.NewQoSConfiguration()
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaBindingPods := []*v1.Pod{}
			// collect the pods with numa_binding on this numa zone
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				namespace, podname, _, err := native.ParseUniqObjectUIDKey(key)
				if err != nil {
					klog.Errorf("[CNRIndicatorNumaExclusiveAnomaly] failed to parse uniq object uid key %s", key)
					continue
				}
				pod, err := ctrl.podLister.Pods(namespace).Get(podname)
				if err != nil {
					klog.Errorf("[CNRIndicatorNumaExclusiveAnomaly] failed to get pod %s", key)
					continue
				}
				if qos.IsPodNumaBinding(qosConf, pod) {
					numaBindingPods = append(numaBindingPods, pod)
				}
			}
			// if a numa_exclusive pod shares the numa zone with other numa_binding pods, report an anomaly
			for _, pod := range numaBindingPods {
				if qos.IsPodNumaExclusive(qosConf, pod) && len(numaBindingPods) > 1 {
					return true
				}
			}
		}
	}
	return false
}

// checkNumaAllocatableSumAnomaly checks whether the per-numa allocatable resources
// are inconsistent with the node-level values: the summed numa CPU allocatable must
// equal the node CPU allocatable, and the summed numa memory allocatable must not
// exceed the node memory capacity
func (ctrl *CNRMonitorController) checkNumaAllocatableSumAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
	node, err := ctrl.nodeLister.Get(cnr.Name)
	if err != nil {
		klog.Errorf("[CNRIndicatorNumaAllocatableSumAnomaly] failed to get node %s", cnr.Name)
		return false
	}

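	// quantities are truncated to int for comparison: CPU in cores, memory in bytes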
	nodeCpuAllocatable, nodeMemCapacity := int(node.Status.Allocatable.Cpu().AsApproximateFloat64()), int(node.Status.Capacity.Memory().AsApproximateFloat64())
	klog.Infof("[CNRIndicatorNumaAllocatableSumAnomaly] nodeCpuAllocatable: %d, nodeMemCapacity: %d", nodeCpuAllocatable, nodeMemCapacity)
	numaCpuAllocatableSum, numaMemAllocatableSum := 0, 0
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaCpuAllocatableSum += int(numa.Resources.Allocatable.Cpu().AsApproximateFloat64())
			numaMemAllocatableSum += int(numa.Resources.Allocatable.Memory().AsApproximateFloat64())
		}
	}
	klog.Infof("[CNRIndicatorNumaAllocatableSumAnomaly] numaCpuAllocatableSum: %d, numaMemAllocatableSum: %d", numaCpuAllocatableSum, numaMemAllocatableSum)
	// TODO: this rule may need to be adapted to the scheduler in the future
	if numaCpuAllocatableSum != nodeCpuAllocatable || numaMemAllocatableSum > nodeMemCapacity {
		return true
	}
	return false
}

// checkPodAllocationSumAnomaly checks whether, on any numa zone, the sum of pod
// allocations is greater than the zone's allocatable resources
func (ctrl *CNRMonitorController) checkPodAllocationSumAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
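	// for each numa zone, sum the requests of the numa_binding pods recorded in the
	// zone's Allocations and compare them against the zone's allocatable resources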
	qosConf := generic.NewQoSConfiguration()
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaCpuAllocatable, numaMemAllocatable := int(numa.Resources.Allocatable.Cpu().AsApproximateFloat64()), int(numa.Resources.Allocatable.Memory().AsApproximateFloat64())
			klog.Infof("[CNRIndicatorPodAllocationSumAnomaly] numaCpuAllocatable: %d, numaMemAllocatable: %d", numaCpuAllocatable, numaMemAllocatable)
			podCpuAllocationSum, podMemAllocationSum := 0, 0
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				namespace, podname, _, err := native.ParseUniqObjectUIDKey(key)
				if err != nil {
					klog.Errorf("[CNRIndicatorPodAllocationSumAnomaly] failed to parse uniq object uid key %s", key)
					continue
				}
				pod, err := ctrl.podLister.Pods(namespace).Get(podname)
				if err != nil {
					klog.Errorf("[CNRIndicatorPodAllocationSumAnomaly] failed to get pod %s", key)
					continue
				}
				// only check the pods with numa_binding for now
				if qos.IsPodNumaBinding(qosConf, pod) {
					podCpuAllocationSum += int(allocation.Requests.Cpu().AsApproximateFloat64())
					podMemAllocationSum += int(allocation.Requests.Memory().AsApproximateFloat64())
				}
			}
			klog.Infof("[CNRIndicatorPodAllocationSumAnomaly] podCpuAllocationSum: %d, podMemAllocationSum: %d", podCpuAllocationSum, podMemAllocationSum)
			if podCpuAllocationSum > numaCpuAllocatable || podMemAllocationSum > numaMemAllocatable {
				return true
			}
		}
	}
	return false
}

// emitCNRAnomalyMetric emits the CNR anomaly metric with the given reason
func (ctrl *CNRMonitorController) emitCNRAnomalyMetric(cnr *v1alpha1.CustomNodeResource, reason string) error {
	_ = ctrl.metricsEmitter.StoreInt64(metricsNameCNRReportAnomaly, 1, metrics.MetricTypeNameRaw,
		metrics.MetricTag{
			Key: "node_name", Val: cnr.Name,
		},
		metrics.MetricTag{
			Key: "reason", Val: reason,
		},
	)

	return nil
}

// checkAndEmitCNRReportLantencyMetric checks and emits the CNR report latency metric
func (ctrl *CNRMonitorController) checkAndEmitCNRReportLantencyMetric(cnr *v1alpha1.CustomNodeResource) error {
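	// podTimeMap is expected to hold, per consumer key, the time at which the pod was
	// scheduled; once the allocation shows up in the CNR we report the elapsed time
	// and drop the entry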
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				scheduledTime, ok := ctrl.podTimeMap.Load(key)
				// skip the pod if it is not in podTimeMap or its recorded time is zero
				if !ok || scheduledTime.(time.Time).IsZero() {
					continue
				}
				// emit the CNR report latency metric
				ctrl.emitCNRReportLantencyMetric(cnr.Name, key, time.Since(scheduledTime.(time.Time)).Milliseconds(), "false")
				// delete the used entry from podTimeMap
				ctrl.podTimeMap.Delete(key)
			}
		}
	}
	return nil
}

// emitCNRReportLantencyMetric emits the CNR report latency metric for a single pod
func (ctrl *CNRMonitorController) emitCNRReportLantencyMetric(nodeName string, key string, lantency int64, isTimeOut string) {
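	// key is the consumer key that encodes the pod's namespace, name and uid;
	// lantency is in milliseconds; isTimeOut becomes the "time_out" metric tag
	// (the caller in this file always passes "false")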
	namespace, podName, uid, err := native.ParseUniqObjectUIDKey(key)
	if err != nil {
		klog.Errorf("[CNRReportLantency] failed to parse uniq object uid key %s", key)
		return
	}
	klog.Infof("[CNRReportLantency] pod %s/%s/%s report latency: %dms", namespace, podName, uid, lantency)
	_ = ctrl.metricsEmitter.StoreFloat64(metricsNameCNRReportLantency, float64(lantency),
		metrics.MetricTypeNameRaw,
		metrics.MetricTag{
			Key: "node_name", Val: nodeName,
		},
		metrics.MetricTag{
			Key: "namespace", Val: namespace,
		},
		metrics.MetricTag{
			Key: "pod_name", Val: podName,
		},
		metrics.MetricTag{
			Key: "pod_uid", Val: uid,
		},
		metrics.MetricTag{
			Key: "time_out", Val: isTimeOut,
		},
	)
}