github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/monitor/cnr_indicator.go

/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package monitor

import (
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"

	"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
	"github.com/kubewharf/katalyst-core/pkg/config/generic"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
	"github.com/kubewharf/katalyst-core/pkg/util/native"
	"github.com/kubewharf/katalyst-core/pkg/util/qos"
)

const (
	metricsNameCNRReportAnomaly  = "cnr_report_anomaly"
	metricsNameCNRReportLantency = "cnr_report_latency"
)

const (
	// reasonNumaExclusiveAnomaly is the reason reported when a pod with both
	// numa_binding and numa_exclusive set shares a NUMA node with other pods.
	reasonNumaExclusiveAnomaly = "NumaExclusiveAnomaly"
	// reasonNumaAllocatableSumAnomaly is the reason reported when the sum of the
	// node's per-NUMA allocatable resources does not match the node allocatable.
	reasonNumaAllocatableSumAnomaly = "AllocatableSumAnomaly"
	// reasonPodAllocationSumAnomaly is the reason reported when the sum of pod
	// allocations on a NUMA node is greater than that NUMA node's allocatable.
	reasonPodAllocationSumAnomaly = "PodAllocationSumAnomaly"
)

// checkNumaExclusiveAnomaly checks whether any pod with both numa_binding and
// numa_exclusive set shares a NUMA node with other numa_binding pods.
func (ctrl *CNRMonitorController) checkNumaExclusiveAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
	qosConf := generic.NewQoSConfiguration()
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaBindingPods := []*v1.Pod{}
			// filter the pods with numa_binding
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				namespace, podName, _, err := native.ParseUniqObjectUIDKey(key)
				if err != nil {
					klog.Errorf("[CNRIndicatorNumaExclusiveAnomaly] failed to parse uniq object uid key %s", key)
					continue
				}
				pod, err := ctrl.podLister.Pods(namespace).Get(podName)
				if err != nil {
					klog.Errorf("[CNRIndicatorNumaExclusiveAnomaly] failed to get pod %s", key)
					continue
				}
				if qos.IsPodNumaBinding(qosConf, pod) {
					numaBindingPods = append(numaBindingPods, pod)
				}
			}
			// if a pod with both numa_binding and numa_exclusive shares the numa with other pods, return true
			for _, pod := range numaBindingPods {
				if qos.IsPodNumaExclusive(qosConf, pod) && len(numaBindingPods) > 1 {
					return true
				}
			}
		}
	}
	return false
}
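// exampleGetConsumerPod is a hypothetical helper, not part of the original file.
// It is only meant to illustrate the pattern repeated in the anomaly checks in
// this file: an allocation's Consumer field is a uniq object UID key identifying
// a pod, which native.ParseUniqObjectUIDKey splits back into namespace/name/uid
// so that the pod can be fetched from the informer cache. The name and placement
// are assumptions for illustration only.
func (ctrl *CNRMonitorController) exampleGetConsumerPod(consumer string) (*v1.Pod, error) {
	namespace, podName, _, err := native.ParseUniqObjectUIDKey(consumer)
	if err != nil {
		return nil, err
	}
	return ctrl.podLister.Pods(namespace).Get(podName)
}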
// checkNumaAllocatableSumAnomaly checks whether the sum of the node's per-NUMA
// allocatable resources does not match the node allocatable.
func (ctrl *CNRMonitorController) checkNumaAllocatableSumAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
	node, err := ctrl.nodeLister.Get(cnr.Name)
	if err != nil {
		klog.Errorf("[CNRIndicatorNumaAllocatableSumAnomaly] failed to get node %s", cnr.Name)
		return false
	}

	nodeCpuAllocatable, nodeMemCapacity := int(node.Status.Allocatable.Cpu().AsApproximateFloat64()), int(node.Status.Capacity.Memory().AsApproximateFloat64())
	klog.Infof("[CNRIndicatorNumaAllocatableSumAnomaly] nodeCpuAllocatable: %d, nodeMemCapacity: %d", nodeCpuAllocatable, nodeMemCapacity)
	numaCpuAllocatableSum, numaMemAllocatableSum := 0, 0
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaCpuAllocatableSum += int(numa.Resources.Allocatable.Cpu().AsApproximateFloat64())
			numaMemAllocatableSum += int(numa.Resources.Allocatable.Memory().AsApproximateFloat64())
		}
	}
	klog.Infof("[CNRIndicatorNumaAllocatableSumAnomaly] numaCpuAllocatableSum: %d, numaMemAllocatableSum: %d", numaCpuAllocatableSum, numaMemAllocatableSum)
	// TODO: this rule may need to adapt to the scheduler in the future
	if numaCpuAllocatableSum != nodeCpuAllocatable || numaMemAllocatableSum > nodeMemCapacity {
		return true
	}
	return false
}

// checkPodAllocationSumAnomaly checks whether the sum of pod allocations on a
// NUMA node is greater than that NUMA node's allocatable.
func (ctrl *CNRMonitorController) checkPodAllocationSumAnomaly(cnr *v1alpha1.CustomNodeResource) bool {
	qosConf := generic.NewQoSConfiguration()
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			numaCpuAllocatable, numaMemAllocatable := int(numa.Resources.Allocatable.Cpu().AsApproximateFloat64()), int(numa.Resources.Allocatable.Memory().AsApproximateFloat64())
			klog.Infof("[CNRIndicatorPodAllocationSumAnomaly] numaCpuAllocatable: %d, numaMemAllocatable: %d", numaCpuAllocatable, numaMemAllocatable)
			podCpuAllocationSum, podMemAllocationSum := 0, 0
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				namespace, podName, _, err := native.ParseUniqObjectUIDKey(key)
				if err != nil {
					klog.Errorf("[CNRIndicatorPodAllocationSumAnomaly] failed to parse uniq object uid key %s", key)
					continue
				}
				pod, err := ctrl.podLister.Pods(namespace).Get(podName)
				if err != nil {
					klog.Errorf("[CNRIndicatorPodAllocationSumAnomaly] failed to get pod %s", key)
					continue
				}
				// only check the pods with numa_binding for now
				if qos.IsPodNumaBinding(qosConf, pod) {
					podCpuAllocationSum += int(allocation.Requests.Cpu().AsApproximateFloat64())
					podMemAllocationSum += int(allocation.Requests.Memory().AsApproximateFloat64())
				}
			}
			klog.Infof("[CNRIndicatorPodAllocationSumAnomaly] podCpuAllocationSum: %d, podMemAllocationSum: %d", podCpuAllocationSum, podMemAllocationSum)
			if podCpuAllocationSum > numaCpuAllocatable || podMemAllocationSum > numaMemAllocatable {
				return true
			}
		}
	}
	return false
}

// emitCNRAnomalyMetric emits the CNR anomaly metric with the given reason.
func (ctrl *CNRMonitorController) emitCNRAnomalyMetric(cnr *v1alpha1.CustomNodeResource, reason string) error {
	_ = ctrl.metricsEmitter.StoreInt64(metricsNameCNRReportAnomaly, 1, metrics.MetricTypeNameRaw,
		metrics.MetricTag{
			Key: "node_name", Val: cnr.Name,
		},
		metrics.MetricTag{
			Key: "reason", Val: reason,
		},
	)

	return nil
}
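// exampleRecordPodScheduledTime is a hypothetical sketch, not part of the original
// controller. It illustrates the contract that checkAndEmitCNRReportLantencyMetric
// below relies on: some event handler records the time a pod was scheduled, keyed
// by the same uniq object UID key that later appears as allocation.Consumer in the
// CNR status. It assumes podTimeMap is a sync.Map, consistent with the
// Load/Delete/type-assertion usage below; the real producer of these entries lives
// elsewhere in this controller and may differ.
func (ctrl *CNRMonitorController) exampleRecordPodScheduledTime(key string, scheduledTime time.Time) {
	ctrl.podTimeMap.Store(key, scheduledTime)
}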
// checkAndEmitCNRReportLantencyMetric checks and emits the CNR report latency metric.
func (ctrl *CNRMonitorController) checkAndEmitCNRReportLantencyMetric(cnr *v1alpha1.CustomNodeResource) error {
	for _, socket := range cnr.Status.TopologyZone {
		for _, numa := range socket.Children {
			if numa.Type != v1alpha1.TopologyTypeNuma {
				// only check numa zones
				continue
			}
			for _, allocation := range numa.Allocations {
				key := allocation.Consumer
				scheduledTime, ok := ctrl.podTimeMap.Load(key)
				// if the pod is not in podTimeMap, or the stored value is zero, skip it
				if !ok || scheduledTime.(time.Time).IsZero() {
					continue
				}
				// emit the cnr report latency metric
				ctrl.emitCNRReportLantencyMetric(cnr.Name, key, time.Since(scheduledTime.(time.Time)).Milliseconds(), "false")
				// delete the used data from podTimeMap
				ctrl.podTimeMap.Delete(key)
			}
		}
	}
	return nil
}

// emitCNRReportLantencyMetric emits the CNR report latency metric.
func (ctrl *CNRMonitorController) emitCNRReportLantencyMetric(nodeName string, key string, latency int64, isTimeOut string) {
	namespace, podName, uid, err := native.ParseUniqObjectUIDKey(key)
	if err != nil {
		klog.Errorf("[CNRReportLantency] failed to parse uniq object uid key %s", key)
		// skip emitting the metric rather than reporting it with empty tags
		return
	}
	klog.Infof("[CNRReportLantency] pod %s/%s/%s report latency: %dms", namespace, podName, uid, latency)
	_ = ctrl.metricsEmitter.StoreFloat64(metricsNameCNRReportLantency, float64(latency),
		metrics.MetricTypeNameRaw,
		metrics.MetricTag{
			Key: "node_name", Val: nodeName,
		},
		metrics.MetricTag{
			Key: "namespace", Val: namespace,
		},
		metrics.MetricTag{
			Key: "pod_name", Val: podName,
		},
		metrics.MetricTag{
			Key: "pod_uid", Val: uid,
		},
		metrics.MetricTag{
			Key: "time_out", Val: isTimeOut,
		},
	)
}
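// exampleEmitCNRAnomalyMetrics is a hypothetical sketch, not part of the original
// file, showing how the three anomaly checks above could be wired together and
// reported through emitCNRAnomalyMetric with their matching reason constants.
// The real call sites live in the controller's sync logic and may differ.
func (ctrl *CNRMonitorController) exampleEmitCNRAnomalyMetrics(cnr *v1alpha1.CustomNodeResource) {
	if ctrl.checkNumaExclusiveAnomaly(cnr) {
		_ = ctrl.emitCNRAnomalyMetric(cnr, reasonNumaExclusiveAnomaly)
	}
	if ctrl.checkNumaAllocatableSumAnomaly(cnr) {
		_ = ctrl.emitCNRAnomalyMetric(cnr, reasonNumaAllocatableSumAnomaly)
	}
	if ctrl.checkPodAllocationSumAnomaly(cnr) {
		_ = ctrl.emitCNRAnomalyMetric(cnr, reasonPodAllocationSumAnomaly)
	}
}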