github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package rodan
    18  
    19  import (
    20  	"context"
    21  	"strconv"
    22  	"time"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/klog/v2"
    26  
    27  	"github.com/kubewharf/katalyst-core/pkg/config/agent/global"
    28  	"github.com/kubewharf/katalyst-core/pkg/config/agent/metaserver"
    29  	"github.com/kubewharf/katalyst-core/pkg/consts"
    30  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/provisioner/rodan/client"
    31  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/provisioner/rodan/types"
    32  	metrictypes "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types"
    33  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod"
    34  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    35  	"github.com/kubewharf/katalyst-core/pkg/util/cgroup/common"
    36  	utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric"
    37  )
    38  
const (
	// pageShift is the left-shift applied to a container's per-NUMA "filepage"
	// value before it is stored (value << 12, i.e. multiplied by 4096).
	// NOTE(review): presumably converts a page count into bytes assuming
	// 4 KiB pages — confirm against the Rodan API.
	pageShift = 12
)
    42  
// RodanMetricsProvisioner collects node, NUMA, QoS-cgroup, CPU-core and
// pod/container statistics through a Rodan client and writes them into a
// MetricStore.
//
// Fields:
//   - client: issues the stats queries (node memory, NUMA memory, cgroup
//     memory, sysctl, per-core CPU and per-pod container stats).
//   - metricStore: destination of every Set*Metric call and of the pod
//     metric garbage collection.
//   - podFetcher: lists the pods whose containers are sampled.
//   - emitter: stored at construction; not referenced by the methods visible
//     in this file.
//   - synced: set to true at the end of every sample pass and read by
//     HasSynced; it is a plain bool with no synchronization in this file.
type RodanMetricsProvisioner struct {
	client      *client.RodanClient
	metricStore *utilmetric.MetricStore
	podFetcher  pod.PodFetcher
	emitter     metrics.MetricEmitter

	synced bool
}
    51  
    52  // NewRodanMetricsProvisioner returns the fetcher that fetch metrics by Inspector client
    53  func NewRodanMetricsProvisioner(
    54  	_ *global.BaseConfiguration,
    55  	metricConf *metaserver.MetricConfiguration,
    56  	emitter metrics.MetricEmitter,
    57  	fetcher pod.PodFetcher,
    58  	metricStore *utilmetric.MetricStore,
    59  ) metrictypes.MetricsProvisioner {
    60  	return &RodanMetricsProvisioner{
    61  		metricStore: metricStore,
    62  		podFetcher:  fetcher,
    63  		client:      client.NewRodanClient(fetcher, nil, metricConf.RodanServerPort),
    64  		emitter:     emitter,
    65  		synced:      false,
    66  	}
    67  }
    68  
// Run performs a single collection pass by delegating to sample. It does not
// loop on its own, so any periodic invocation is up to the caller.
func (i *RodanMetricsProvisioner) Run(ctx context.Context) {
	i.sample(ctx)
}
    72  
    73  func (i *RodanMetricsProvisioner) sample(ctx context.Context) {
    74  	i.updateNodeStats()
    75  	i.updateNUMAStats()
    76  	i.updateNodeCgroupStats()
    77  	i.updateNodeSysctlStats()
    78  	i.updateCoreStats()
    79  	i.updatePodStats(ctx)
    80  
    81  	i.synced = true
    82  }
    83  
// HasSynced reports whether at least one sample pass has finished. The flag is
// set at the end of sample even when individual stat queries in that pass
// failed, because those failures are only logged.
func (i *RodanMetricsProvisioner) HasSynced() bool {
	return i.synced
}
    87  
    88  func (i *RodanMetricsProvisioner) updateNodeStats() {
    89  	// update node memory stats
    90  	nodeMemoryData, err := i.client.GetNodeMemoryStats()
    91  	if err != nil {
    92  		klog.Errorf("[inspector] get node memory stats failed, err: %v", err)
    93  	} else {
    94  		i.processNodeMemoryData(nodeMemoryData)
    95  	}
    96  }
    97  
    98  // updateNodeCgroupStats update only besteffort and burstable QoS level cgroup stats
    99  func (i *RodanMetricsProvisioner) updateNodeCgroupStats() {
   100  	// update cgroup memory stats
   101  	memoryCgroupData, err := i.client.GetNodeCgroupMemoryStats()
   102  	if err != nil {
   103  		klog.Errorf("[inspector] get memory cgroup stats failed, err: %v", err)
   104  	} else {
   105  		i.processCgroupMemoryData(memoryCgroupData)
   106  	}
   107  }
   108  
   109  func (i *RodanMetricsProvisioner) updateNodeSysctlStats() {
   110  	// update node sysctl data
   111  	sysctlData, err := i.client.GetNodeSysctl()
   112  	if err != nil {
   113  		klog.Errorf("[inspector] get node sysctl failed, err: %v", err)
   114  	} else {
   115  		i.processNodeSysctlData(sysctlData)
   116  	}
   117  }
   118  
   119  func (i *RodanMetricsProvisioner) updateNUMAStats() {
   120  	// update NUMA memory stats
   121  	NUMAMemoryData, err := i.client.GetNUMAMemoryStats()
   122  	if err != nil {
   123  		klog.Errorf("[inspector] get NUMA memory stats failed, err: %v", err)
   124  	} else {
   125  		i.processNUMAMemoryData(NUMAMemoryData)
   126  	}
   127  }
   128  
   129  func (i *RodanMetricsProvisioner) updateCoreStats() {
   130  	// update core CPU stats
   131  	coreCPUData, err := i.client.GetCoreCPUStats()
   132  	if err != nil {
   133  		klog.Errorf("[inspector] get core CPU stats failed, err: %v", err)
   134  	} else {
   135  		i.processCoreCPUData(coreCPUData)
   136  	}
   137  }
   138  
   139  func (i *RodanMetricsProvisioner) updatePodStats(ctx context.Context) {
   140  	// list all pods
   141  	pods, err := i.podFetcher.GetPodList(ctx, func(_ *v1.Pod) bool { return true })
   142  	if err != nil {
   143  		klog.Errorf("[inspector] GetPodList fail: %v", err)
   144  		return
   145  	}
   146  
   147  	podUIDSet := make(map[string]bool)
   148  	for _, pod := range pods {
   149  		podUIDSet[string(pod.UID)] = true
   150  		cpuStats, err := i.client.GetPodContainerCPUStats(ctx, string(pod.UID))
   151  		if err != nil {
   152  			klog.Errorf("[inspector] get container CPU stats failed, pod: %v, err: %v", pod.Name, err)
   153  		} else {
   154  			for containerName, containerCPUStats := range cpuStats {
   155  				i.processContainerCPUData(string(pod.UID), containerName, containerCPUStats)
   156  			}
   157  		}
   158  
   159  		cgroupMemStats, err := i.client.GetPodContainerCgroupMemStats(ctx, string(pod.UID))
   160  		if err != nil {
   161  			klog.Errorf("[inspector] get container cgroupmem stats failed, pod: %v, err: %v", pod.Name, err)
   162  		} else {
   163  			for containerName, containerCgroupMem := range cgroupMemStats {
   164  				i.processContainerCgroupMemData(string(pod.UID), containerName, containerCgroupMem)
   165  			}
   166  		}
   167  
   168  		loadStats, err := i.client.GetPodContainerLoadStats(ctx, string(pod.UID))
   169  		if err != nil {
   170  			klog.Errorf("[inspector] get container load stats failed, pod: %v, err: %v", pod.Name, err)
   171  		} else {
   172  			for containerName, containerLoad := range loadStats {
   173  				i.processContainerLoadData(string(pod.UID), containerName, containerLoad)
   174  			}
   175  		}
   176  
   177  		cghardware, err := i.client.GetPodContainerCghardwareStats(ctx, string(pod.UID))
   178  		if err != nil {
   179  			klog.Errorf("[inspector] get container cghardware failed, pod: %v, err: %v", pod.Name, err)
   180  		} else {
   181  			for containerName, containerCghardware := range cghardware {
   182  				i.processContainerCghardwareData(string(pod.UID), containerName, containerCghardware)
   183  			}
   184  		}
   185  
   186  		cgNumaStats, err := i.client.GetPodContainerCgNumaStats(ctx, string(pod.UID))
   187  		if err != nil {
   188  			klog.Errorf("[inspector] get container numa stats failed, pod: %v, err: %v", pod.Name, err)
   189  		} else {
   190  			for containerName, containerNumaStats := range cgNumaStats {
   191  				i.processContainerNumaData(string(pod.UID), containerName, containerNumaStats)
   192  			}
   193  		}
   194  	}
   195  	i.metricStore.GCPodsMetric(podUIDSet)
   196  }
   197  
   198  func (i *RodanMetricsProvisioner) processNodeMemoryData(nodeMemoryData []types.Cell) {
   199  	updateTime := time.Now()
   200  
   201  	metricMap := types.MetricsMap[types.NodeMemoryPath]
   202  
   203  	for _, cell := range nodeMemoryData {
   204  		metricName, ok := metricMap[cell.Key]
   205  		if !ok {
   206  			continue
   207  		}
   208  		switch cell.Key {
   209  		case "memory_pgsteal_kswapd":
   210  			i.metricStore.SetNodeMetric(
   211  				metricName,
   212  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   213  			)
   214  		default:
   215  			i.metricStore.SetNodeMetric(
   216  				metricName,
   217  				utilmetric.MetricData{Value: float64(int(cell.Val) << 10), Time: &updateTime},
   218  			)
   219  		}
   220  	}
   221  }
   222  
   223  func (i *RodanMetricsProvisioner) processNodeSysctlData(nodeSysctlData []types.Cell) {
   224  	updateTime := time.Now()
   225  
   226  	metricMap := types.MetricsMap[types.NodeSysctlPath]
   227  
   228  	for _, cell := range nodeSysctlData {
   229  		metricName, ok := metricMap[cell.Key]
   230  		if !ok {
   231  			continue
   232  		}
   233  
   234  		i.metricStore.SetNodeMetric(
   235  			metricName,
   236  			utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   237  		)
   238  
   239  	}
   240  }
   241  
   242  func (i *RodanMetricsProvisioner) processCgroupMemoryData(cgroupMemoryData []types.Cell) {
   243  	updateTime := time.Now()
   244  
   245  	metricMap := types.MetricsMap[types.NodeCgroupMemoryPath]
   246  	for _, cell := range cgroupMemoryData {
   247  		metricName, ok := metricMap[cell.Key]
   248  		if !ok {
   249  			continue
   250  		}
   251  
   252  		switch cell.Key {
   253  		case "qosgroupmem_besteffort_memory_rss", "qosgroupmem_besteffort_memory_usage":
   254  			i.metricStore.SetCgroupMetric(common.CgroupFsRootPathBestEffort, metricName,
   255  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime})
   256  		case "qosgroupmem_burstable_memory_rss", "qosgroupmem_burstable_memory_usage":
   257  			i.metricStore.SetCgroupMetric(common.CgroupFsRootPathBurstable, metricName,
   258  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime})
   259  		}
   260  	}
   261  }
   262  
   263  func (i *RodanMetricsProvisioner) processNUMAMemoryData(NUMAMemoryData map[int][]types.Cell) {
   264  	updateTime := time.Now()
   265  
   266  	metricMap := types.MetricsMap[types.NumaMemoryPath]
   267  
   268  	for numaID, cells := range NUMAMemoryData {
   269  		for _, cell := range cells {
   270  			metricName, ok := metricMap[cell.Key]
   271  			if !ok {
   272  				continue
   273  			}
   274  
   275  			i.metricStore.SetNumaMetric(
   276  				numaID,
   277  				metricName,
   278  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   279  			)
   280  		}
   281  	}
   282  }
   283  
   284  func (i *RodanMetricsProvisioner) processCoreCPUData(coreCPUData map[int][]types.Cell) {
   285  	updateTime := time.Now()
   286  
   287  	metricMap := types.MetricsMap[types.NodeCPUPath]
   288  
   289  	for cpuID, coreData := range coreCPUData {
   290  		for _, cell := range coreData {
   291  			metricName, ok := metricMap[cell.Key]
   292  			if !ok {
   293  				continue
   294  			}
   295  
   296  			switch cell.Key {
   297  			case "usage":
   298  				// node cpu usage if cpuID == -1
   299  				if cpuID == -1 {
   300  					i.metricStore.SetNodeMetric(
   301  						consts.MetricCPUUsageRatio,
   302  						utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime},
   303  					)
   304  				} else {
   305  					i.metricStore.SetCPUMetric(
   306  						cpuID,
   307  						consts.MetricCPUUsageRatio,
   308  						utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime},
   309  					)
   310  				}
   311  			case "sched_wait":
   312  				i.metricStore.SetCPUMetric(
   313  					cpuID,
   314  					consts.MetricCPUSchedwait,
   315  					utilmetric.MetricData{Value: cell.Val * 1000, Time: &updateTime},
   316  				)
   317  			default:
   318  				i.metricStore.SetCPUMetric(
   319  					cpuID,
   320  					metricName,
   321  					utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   322  				)
   323  			}
   324  		}
   325  	}
   326  }
   327  
   328  func (i *RodanMetricsProvisioner) processContainerCPUData(podUID, containerName string, cpuData []types.Cell) {
   329  	var (
   330  		updateTime = time.Now()
   331  		metricMap  = types.MetricsMap[types.ContainerCPUPath]
   332  	)
   333  
   334  	for _, cell := range cpuData {
   335  		metricName, ok := metricMap[cell.Key]
   336  		if !ok {
   337  			continue
   338  		}
   339  
   340  		switch cell.Key {
   341  		case "cgcpu_usage":
   342  			i.metricStore.SetContainerMetric(
   343  				podUID,
   344  				containerName,
   345  				metricName,
   346  				utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime},
   347  			)
   348  		default:
   349  			i.metricStore.SetContainerMetric(
   350  				podUID,
   351  				containerName,
   352  				metricName,
   353  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   354  			)
   355  		}
   356  	}
   357  }
   358  
   359  func (i *RodanMetricsProvisioner) processContainerCghardwareData(podUID, containerName string, cghardwareData []types.Cell) {
   360  	var (
   361  		updateTime = time.Now()
   362  		metricMap  = types.MetricsMap[types.ContainerCghardwarePath]
   363  
   364  		cyclesOld, _         = i.metricStore.GetContainerMetric(podUID, containerName, consts.MetricCPUCyclesContainer)
   365  		instructionsOld, _   = i.metricStore.GetContainerMetric(podUID, containerName, consts.MetricCPUInstructionsContainer)
   366  		cycles, instructions float64
   367  	)
   368  
   369  	for _, cell := range cghardwareData {
   370  		metricName, ok := metricMap[cell.Key]
   371  		if !ok {
   372  			continue
   373  		}
   374  
   375  		i.metricStore.SetContainerMetric(
   376  			podUID,
   377  			containerName,
   378  			metricName,
   379  			utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   380  		)
   381  
   382  		if cell.Key == "cycles" {
   383  			cycles = cell.Val
   384  		}
   385  		if cell.Key == "instructions" {
   386  			instructions = cell.Val
   387  		}
   388  	}
   389  	if cyclesOld.Value > 0 && cycles > 0 && instructionsOld.Value > 0 && instructions > 0 {
   390  		instructionDiff := instructions - instructionsOld.Value
   391  		if instructionDiff > 0 {
   392  			cpi := (cycles - cyclesOld.Value) / instructionDiff
   393  			i.metricStore.SetContainerMetric(
   394  				podUID,
   395  				containerName,
   396  				consts.MetricCPUCPIContainer,
   397  				utilmetric.MetricData{Value: cpi, Time: &updateTime},
   398  			)
   399  		}
   400  	}
   401  }
   402  
   403  func (i *RodanMetricsProvisioner) processContainerCgroupMemData(podUID, containerName string, cgroupMemData []types.Cell) {
   404  	updateTime := time.Now()
   405  
   406  	metricMap := types.MetricsMap[types.ContainerCgroupMemoryPath]
   407  
   408  	for _, cell := range cgroupMemData {
   409  		metricName, ok := metricMap[cell.Key]
   410  		if !ok {
   411  			continue
   412  		}
   413  
   414  		i.metricStore.SetContainerMetric(
   415  			podUID,
   416  			containerName,
   417  			metricName,
   418  			utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   419  		)
   420  	}
   421  }
   422  
   423  func (i *RodanMetricsProvisioner) processContainerLoadData(podUID, containerName string, loadData []types.Cell) {
   424  	updateTime := time.Now()
   425  
   426  	metricMap := types.MetricsMap[types.ContainerLoadPath]
   427  
   428  	for _, cell := range loadData {
   429  		metricName, ok := metricMap[cell.Key]
   430  		if !ok {
   431  			continue
   432  		}
   433  
   434  		switch cell.Key {
   435  		case "loadavg_loadavg1", "loadavg_loadavg5", "loadavg_loadavg15":
   436  			i.metricStore.SetContainerMetric(
   437  				podUID,
   438  				containerName,
   439  				metricName,
   440  				utilmetric.MetricData{Value: cell.Val / 100.0, Time: &updateTime},
   441  			)
   442  		default:
   443  			i.metricStore.SetContainerMetric(
   444  				podUID,
   445  				containerName,
   446  				metricName,
   447  				utilmetric.MetricData{Value: cell.Val, Time: &updateTime},
   448  			)
   449  		}
   450  
   451  	}
   452  }
   453  
   454  func (i *RodanMetricsProvisioner) processContainerNumaData(podUID, containerName string, containerNumaData map[int][]types.Cell) {
   455  	updateTime := time.Now()
   456  
   457  	metricMap := types.MetricsMap[types.ContainerNumaStatPath]
   458  
   459  	for numaNode, cells := range containerNumaData {
   460  		for _, cell := range cells {
   461  			metricName, ok := metricMap[cell.Key]
   462  			if !ok {
   463  				continue
   464  			}
   465  
   466  			switch cell.Key {
   467  			case "filepage":
   468  				i.metricStore.SetContainerNumaMetric(podUID, containerName, strconv.Itoa(numaNode), metricName,
   469  					utilmetric.MetricData{Value: float64(int(cell.Val) << pageShift), Time: &updateTime})
   470  			default:
   471  
   472  			}
   473  		}
   474  	}
   475  }