github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package rodan 18 19 import ( 20 "context" 21 "strconv" 22 "time" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/klog/v2" 26 27 "github.com/kubewharf/katalyst-core/pkg/config/agent/global" 28 "github.com/kubewharf/katalyst-core/pkg/config/agent/metaserver" 29 "github.com/kubewharf/katalyst-core/pkg/consts" 30 "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/provisioner/rodan/client" 31 "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/provisioner/rodan/types" 32 metrictypes "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" 33 "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" 34 "github.com/kubewharf/katalyst-core/pkg/metrics" 35 "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" 36 utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" 37 ) 38 39 const ( 40 pageShift = 12 41 ) 42 43 type RodanMetricsProvisioner struct { 44 client *client.RodanClient 45 metricStore *utilmetric.MetricStore 46 podFetcher pod.PodFetcher 47 emitter metrics.MetricEmitter 48 49 synced bool 50 } 51 52 // NewRodanMetricsProvisioner returns the fetcher that fetch metrics by Inspector client 53 func NewRodanMetricsProvisioner( 54 _ *global.BaseConfiguration, 55 metricConf *metaserver.MetricConfiguration, 56 emitter metrics.MetricEmitter, 57 fetcher pod.PodFetcher, 58 metricStore *utilmetric.MetricStore, 59 ) metrictypes.MetricsProvisioner { 60 return &RodanMetricsProvisioner{ 61 metricStore: metricStore, 62 podFetcher: fetcher, 63 client: client.NewRodanClient(fetcher, nil, metricConf.RodanServerPort), 64 emitter: emitter, 65 synced: false, 66 } 67 } 68 69 func (i *RodanMetricsProvisioner) Run(ctx context.Context) { 70 i.sample(ctx) 71 } 72 73 func (i *RodanMetricsProvisioner) sample(ctx context.Context) { 74 i.updateNodeStats() 75 i.updateNUMAStats() 76 i.updateNodeCgroupStats() 77 i.updateNodeSysctlStats() 78 i.updateCoreStats() 79 i.updatePodStats(ctx) 80 81 i.synced = true 82 } 83 84 func (i *RodanMetricsProvisioner) HasSynced() bool { 85 return i.synced 86 } 87 88 func (i *RodanMetricsProvisioner) updateNodeStats() { 89 // update node memory stats 90 nodeMemoryData, err := i.client.GetNodeMemoryStats() 91 if err != nil { 92 klog.Errorf("[inspector] get node memory stats failed, err: %v", err) 93 } else { 94 i.processNodeMemoryData(nodeMemoryData) 95 } 96 } 97 98 // updateNodeCgroupStats update only besteffort and burstable QoS level cgroup stats 99 func (i *RodanMetricsProvisioner) updateNodeCgroupStats() { 100 // update cgroup memory stats 101 memoryCgroupData, err := i.client.GetNodeCgroupMemoryStats() 102 if err != nil { 103 klog.Errorf("[inspector] get memory cgroup stats failed, err: %v", err) 104 } else { 105 i.processCgroupMemoryData(memoryCgroupData) 106 } 107 } 108 109 func (i *RodanMetricsProvisioner) updateNodeSysctlStats() { 110 // update node sysctl data 111 sysctlData, err := i.client.GetNodeSysctl() 112 if err != nil { 113 klog.Errorf("[inspector] get node sysctl failed, err: %v", err) 114 } else { 115 i.processNodeSysctlData(sysctlData) 116 } 117 } 118 119 func (i *RodanMetricsProvisioner) updateNUMAStats() { 120 // update NUMA memory stats 121 NUMAMemoryData, err := i.client.GetNUMAMemoryStats() 122 if err != nil { 123 klog.Errorf("[inspector] get NUMA memory stats failed, err: %v", err) 124 } else { 125 i.processNUMAMemoryData(NUMAMemoryData) 126 } 127 } 128 129 func (i *RodanMetricsProvisioner) updateCoreStats() { 130 // update core CPU stats 131 coreCPUData, err := i.client.GetCoreCPUStats() 132 if err != nil { 133 klog.Errorf("[inspector] get core CPU stats failed, err: %v", err) 134 } else { 135 i.processCoreCPUData(coreCPUData) 136 } 137 } 138 139 func (i *RodanMetricsProvisioner) updatePodStats(ctx context.Context) { 140 // list all pods 141 pods, err := i.podFetcher.GetPodList(ctx, func(_ *v1.Pod) bool { return true }) 142 if err != nil { 143 klog.Errorf("[inspector] GetPodList fail: %v", err) 144 return 145 } 146 147 podUIDSet := make(map[string]bool) 148 for _, pod := range pods { 149 podUIDSet[string(pod.UID)] = true 150 cpuStats, err := i.client.GetPodContainerCPUStats(ctx, string(pod.UID)) 151 if err != nil { 152 klog.Errorf("[inspector] get container CPU stats failed, pod: %v, err: %v", pod.Name, err) 153 } else { 154 for containerName, containerCPUStats := range cpuStats { 155 i.processContainerCPUData(string(pod.UID), containerName, containerCPUStats) 156 } 157 } 158 159 cgroupMemStats, err := i.client.GetPodContainerCgroupMemStats(ctx, string(pod.UID)) 160 if err != nil { 161 klog.Errorf("[inspector] get container cgroupmem stats failed, pod: %v, err: %v", pod.Name, err) 162 } else { 163 for containerName, containerCgroupMem := range cgroupMemStats { 164 i.processContainerCgroupMemData(string(pod.UID), containerName, containerCgroupMem) 165 } 166 } 167 168 loadStats, err := i.client.GetPodContainerLoadStats(ctx, string(pod.UID)) 169 if err != nil { 170 klog.Errorf("[inspector] get container load stats failed, pod: %v, err: %v", pod.Name, err) 171 } else { 172 for containerName, containerLoad := range loadStats { 173 i.processContainerLoadData(string(pod.UID), containerName, containerLoad) 174 } 175 } 176 177 cghardware, err := i.client.GetPodContainerCghardwareStats(ctx, string(pod.UID)) 178 if err != nil { 179 klog.Errorf("[inspector] get container cghardware failed, pod: %v, err: %v", pod.Name, err) 180 } else { 181 for containerName, containerCghardware := range cghardware { 182 i.processContainerCghardwareData(string(pod.UID), containerName, containerCghardware) 183 } 184 } 185 186 cgNumaStats, err := i.client.GetPodContainerCgNumaStats(ctx, string(pod.UID)) 187 if err != nil { 188 klog.Errorf("[inspector] get container numa stats failed, pod: %v, err: %v", pod.Name, err) 189 } else { 190 for containerName, containerNumaStats := range cgNumaStats { 191 i.processContainerNumaData(string(pod.UID), containerName, containerNumaStats) 192 } 193 } 194 } 195 i.metricStore.GCPodsMetric(podUIDSet) 196 } 197 198 func (i *RodanMetricsProvisioner) processNodeMemoryData(nodeMemoryData []types.Cell) { 199 updateTime := time.Now() 200 201 metricMap := types.MetricsMap[types.NodeMemoryPath] 202 203 for _, cell := range nodeMemoryData { 204 metricName, ok := metricMap[cell.Key] 205 if !ok { 206 continue 207 } 208 switch cell.Key { 209 case "memory_pgsteal_kswapd": 210 i.metricStore.SetNodeMetric( 211 metricName, 212 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 213 ) 214 default: 215 i.metricStore.SetNodeMetric( 216 metricName, 217 utilmetric.MetricData{Value: float64(int(cell.Val) << 10), Time: &updateTime}, 218 ) 219 } 220 } 221 } 222 223 func (i *RodanMetricsProvisioner) processNodeSysctlData(nodeSysctlData []types.Cell) { 224 updateTime := time.Now() 225 226 metricMap := types.MetricsMap[types.NodeSysctlPath] 227 228 for _, cell := range nodeSysctlData { 229 metricName, ok := metricMap[cell.Key] 230 if !ok { 231 continue 232 } 233 234 i.metricStore.SetNodeMetric( 235 metricName, 236 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 237 ) 238 239 } 240 } 241 242 func (i *RodanMetricsProvisioner) processCgroupMemoryData(cgroupMemoryData []types.Cell) { 243 updateTime := time.Now() 244 245 metricMap := types.MetricsMap[types.NodeCgroupMemoryPath] 246 for _, cell := range cgroupMemoryData { 247 metricName, ok := metricMap[cell.Key] 248 if !ok { 249 continue 250 } 251 252 switch cell.Key { 253 case "qosgroupmem_besteffort_memory_rss", "qosgroupmem_besteffort_memory_usage": 254 i.metricStore.SetCgroupMetric(common.CgroupFsRootPathBestEffort, metricName, 255 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}) 256 case "qosgroupmem_burstable_memory_rss", "qosgroupmem_burstable_memory_usage": 257 i.metricStore.SetCgroupMetric(common.CgroupFsRootPathBurstable, metricName, 258 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}) 259 } 260 } 261 } 262 263 func (i *RodanMetricsProvisioner) processNUMAMemoryData(NUMAMemoryData map[int][]types.Cell) { 264 updateTime := time.Now() 265 266 metricMap := types.MetricsMap[types.NumaMemoryPath] 267 268 for numaID, cells := range NUMAMemoryData { 269 for _, cell := range cells { 270 metricName, ok := metricMap[cell.Key] 271 if !ok { 272 continue 273 } 274 275 i.metricStore.SetNumaMetric( 276 numaID, 277 metricName, 278 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 279 ) 280 } 281 } 282 } 283 284 func (i *RodanMetricsProvisioner) processCoreCPUData(coreCPUData map[int][]types.Cell) { 285 updateTime := time.Now() 286 287 metricMap := types.MetricsMap[types.NodeCPUPath] 288 289 for cpuID, coreData := range coreCPUData { 290 for _, cell := range coreData { 291 metricName, ok := metricMap[cell.Key] 292 if !ok { 293 continue 294 } 295 296 switch cell.Key { 297 case "usage": 298 // node cpu usage if cpuID == -1 299 if cpuID == -1 { 300 i.metricStore.SetNodeMetric( 301 consts.MetricCPUUsageRatio, 302 utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime}, 303 ) 304 } else { 305 i.metricStore.SetCPUMetric( 306 cpuID, 307 consts.MetricCPUUsageRatio, 308 utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime}, 309 ) 310 } 311 case "sched_wait": 312 i.metricStore.SetCPUMetric( 313 cpuID, 314 consts.MetricCPUSchedwait, 315 utilmetric.MetricData{Value: cell.Val * 1000, Time: &updateTime}, 316 ) 317 default: 318 i.metricStore.SetCPUMetric( 319 cpuID, 320 metricName, 321 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 322 ) 323 } 324 } 325 } 326 } 327 328 func (i *RodanMetricsProvisioner) processContainerCPUData(podUID, containerName string, cpuData []types.Cell) { 329 var ( 330 updateTime = time.Now() 331 metricMap = types.MetricsMap[types.ContainerCPUPath] 332 ) 333 334 for _, cell := range cpuData { 335 metricName, ok := metricMap[cell.Key] 336 if !ok { 337 continue 338 } 339 340 switch cell.Key { 341 case "cgcpu_usage": 342 i.metricStore.SetContainerMetric( 343 podUID, 344 containerName, 345 metricName, 346 utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime}, 347 ) 348 default: 349 i.metricStore.SetContainerMetric( 350 podUID, 351 containerName, 352 metricName, 353 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 354 ) 355 } 356 } 357 } 358 359 func (i *RodanMetricsProvisioner) processContainerCghardwareData(podUID, containerName string, cghardwareData []types.Cell) { 360 var ( 361 updateTime = time.Now() 362 metricMap = types.MetricsMap[types.ContainerCghardwarePath] 363 364 cyclesOld, _ = i.metricStore.GetContainerMetric(podUID, containerName, consts.MetricCPUCyclesContainer) 365 instructionsOld, _ = i.metricStore.GetContainerMetric(podUID, containerName, consts.MetricCPUInstructionsContainer) 366 cycles, instructions float64 367 ) 368 369 for _, cell := range cghardwareData { 370 metricName, ok := metricMap[cell.Key] 371 if !ok { 372 continue 373 } 374 375 i.metricStore.SetContainerMetric( 376 podUID, 377 containerName, 378 metricName, 379 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 380 ) 381 382 if cell.Key == "cycles" { 383 cycles = cell.Val 384 } 385 if cell.Key == "instructions" { 386 instructions = cell.Val 387 } 388 } 389 if cyclesOld.Value > 0 && cycles > 0 && instructionsOld.Value > 0 && instructions > 0 { 390 instructionDiff := instructions - instructionsOld.Value 391 if instructionDiff > 0 { 392 cpi := (cycles - cyclesOld.Value) / instructionDiff 393 i.metricStore.SetContainerMetric( 394 podUID, 395 containerName, 396 consts.MetricCPUCPIContainer, 397 utilmetric.MetricData{Value: cpi, Time: &updateTime}, 398 ) 399 } 400 } 401 } 402 403 func (i *RodanMetricsProvisioner) processContainerCgroupMemData(podUID, containerName string, cgroupMemData []types.Cell) { 404 updateTime := time.Now() 405 406 metricMap := types.MetricsMap[types.ContainerCgroupMemoryPath] 407 408 for _, cell := range cgroupMemData { 409 metricName, ok := metricMap[cell.Key] 410 if !ok { 411 continue 412 } 413 414 i.metricStore.SetContainerMetric( 415 podUID, 416 containerName, 417 metricName, 418 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 419 ) 420 } 421 } 422 423 func (i *RodanMetricsProvisioner) processContainerLoadData(podUID, containerName string, loadData []types.Cell) { 424 updateTime := time.Now() 425 426 metricMap := types.MetricsMap[types.ContainerLoadPath] 427 428 for _, cell := range loadData { 429 metricName, ok := metricMap[cell.Key] 430 if !ok { 431 continue 432 } 433 434 switch cell.Key { 435 case "loadavg_loadavg1", "loadavg_loadavg5", "loadavg_loadavg15": 436 i.metricStore.SetContainerMetric( 437 podUID, 438 containerName, 439 metricName, 440 utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime}, 441 ) 442 default: 443 i.metricStore.SetContainerMetric( 444 podUID, 445 containerName, 446 metricName, 447 utilmetric.MetricData{Value: cell.Val, Time: &updateTime}, 448 ) 449 } 450 451 } 452 } 453 454 func (i *RodanMetricsProvisioner) processContainerNumaData(podUID, containerName string, containerNumaData map[int][]types.Cell) { 455 updateTime := time.Now() 456 457 metricMap := types.MetricsMap[types.ContainerNumaStatPath] 458 459 for numaNode, cells := range containerNumaData { 460 for _, cell := range cells { 461 metricName, ok := metricMap[cell.Key] 462 if !ok { 463 continue 464 } 465 466 switch cell.Key { 467 case "filepage": 468 i.metricStore.SetContainerNumaMetric(podUID, containerName, strconv.Itoa(numaNode), metricName, 469 utilmetric.MetricData{Value: float64(int(cell.Val) << pageShift), Time: &updateTime}) 470 default: 471 472 } 473 } 474 } 475 }