github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/agent/metric/metric_impl.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metric
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math/rand"
    23  	"sync"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  
    29  	"github.com/kubewharf/katalyst-core/pkg/config/agent/global"
    30  	"github.com/kubewharf/katalyst-core/pkg/config/agent/metaserver"
    31  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types"
    32  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod"
    33  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    34  	"github.com/kubewharf/katalyst-core/pkg/util/machine"
    35  	utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric"
    36  	"github.com/kubewharf/katalyst-core/pkg/util/syntax"
    37  )
    38  
    39  type MetricsNotifierManagerImpl struct {
    40  	*syntax.RWMutex
    41  	metricStore        *utilmetric.MetricStore
    42  	registeredNotifier map[types.MetricsScope]map[string]*types.NotifiedData
    43  }
    44  
    45  func NewMetricsNotifierManager(metricStore *utilmetric.MetricStore, emitter metrics.MetricEmitter) types.MetricsNotifierManager {
    46  	return &MetricsNotifierManagerImpl{
    47  		metricStore: metricStore,
    48  		RWMutex:     syntax.NewRWMutex(emitter),
    49  		registeredNotifier: map[types.MetricsScope]map[string]*types.NotifiedData{
    50  			types.MetricsScopeNode:          make(map[string]*types.NotifiedData),
    51  			types.MetricsScopeNuma:          make(map[string]*types.NotifiedData),
    52  			types.MetricsScopeCPU:           make(map[string]*types.NotifiedData),
    53  			types.MetricsScopeDevice:        make(map[string]*types.NotifiedData),
    54  			types.MetricsScopeContainer:     make(map[string]*types.NotifiedData),
    55  			types.MetricsScopeContainerNUMA: make(map[string]*types.NotifiedData),
    56  		},
    57  	}
    58  }
    59  
    60  func (m *MetricsNotifierManagerImpl) RegisterNotifier(scope types.MetricsScope, req types.NotifiedRequest,
    61  	response chan types.NotifiedResponse,
    62  ) string {
    63  	if _, ok := m.registeredNotifier[scope]; !ok {
    64  		return ""
    65  	}
    66  
    67  	m.Lock()
    68  	defer m.Unlock()
    69  
    70  	randBytes := make([]byte, 30)
    71  	rand.Read(randBytes)
    72  	key := string(randBytes)
    73  
    74  	m.registeredNotifier[scope][key] = &types.NotifiedData{
    75  		Scope:    scope,
    76  		Req:      req,
    77  		Response: response,
    78  	}
    79  	return key
    80  }
    81  
    82  func (m *MetricsNotifierManagerImpl) DeRegisterNotifier(scope types.MetricsScope, key string) {
    83  	m.Lock()
    84  	defer m.Unlock()
    85  
    86  	delete(m.registeredNotifier[scope], key)
    87  }
    88  
    89  func (m *MetricsNotifierManagerImpl) Notify() {
    90  	m.notifySystem()
    91  	m.notifyPods()
    92  }
    93  
    94  // notifySystem notifies system-related data
    95  func (m *MetricsNotifierManagerImpl) notifySystem() {
    96  	now := time.Now()
    97  	m.RLock()
    98  	defer m.RUnlock()
    99  
   100  	for _, reg := range m.registeredNotifier[types.MetricsScopeNode] {
   101  		v, err := m.metricStore.GetNodeMetric(reg.Req.MetricName)
   102  		if err != nil {
   103  			continue
   104  		} else if v.Time == nil {
   105  			v.Time = &now
   106  		}
   107  
   108  		if reg.LastNotify.Equal(*v.Time) {
   109  			continue
   110  		} else {
   111  			reg.LastNotify = *v.Time
   112  		}
   113  
   114  		reg.Response <- types.NotifiedResponse{
   115  			Req:        reg.Req,
   116  			MetricData: v,
   117  		}
   118  	}
   119  
   120  	for _, reg := range m.registeredNotifier[types.MetricsScopeDevice] {
   121  		v, err := m.metricStore.GetDeviceMetric(reg.Req.DeviceID, reg.Req.MetricName)
   122  		if err != nil {
   123  			continue
   124  		} else if v.Time == nil {
   125  			v.Time = &now
   126  		}
   127  
   128  		if reg.LastNotify.Equal(*v.Time) {
   129  			continue
   130  		} else {
   131  			reg.LastNotify = *v.Time
   132  		}
   133  
   134  		reg.Response <- types.NotifiedResponse{
   135  			Req:        reg.Req,
   136  			MetricData: v,
   137  		}
   138  	}
   139  
   140  	for n, reg := range m.registeredNotifier[types.MetricsScopeNuma] {
   141  		v, err := m.metricStore.GetNumaMetric(reg.Req.NumaID, reg.Req.MetricName)
   142  		if err != nil {
   143  			continue
   144  		} else if v.Time == nil {
   145  			v.Time = &now
   146  		}
   147  
   148  		if m.registeredNotifier[types.MetricsScopeNuma][n].LastNotify.Equal(*v.Time) {
   149  			continue
   150  		} else {
   151  			reg.LastNotify = *v.Time
   152  		}
   153  
   154  		reg.Response <- types.NotifiedResponse{
   155  			Req:        reg.Req,
   156  			MetricData: v,
   157  		}
   158  	}
   159  
   160  	for n, reg := range m.registeredNotifier[types.MetricsScopeCPU] {
   161  		v, err := m.metricStore.GetCPUMetric(reg.Req.CoreID, reg.Req.MetricName)
   162  		if err != nil {
   163  			continue
   164  		} else if v.Time == nil {
   165  			v.Time = &now
   166  		}
   167  
   168  		if reg.LastNotify.Equal(*v.Time) {
   169  			continue
   170  		} else {
   171  			m.registeredNotifier[types.MetricsScopeCPU][n].LastNotify = *v.Time
   172  		}
   173  
   174  		reg.Response <- types.NotifiedResponse{
   175  			Req:        reg.Req,
   176  			MetricData: v,
   177  		}
   178  	}
   179  }
   180  
   181  // notifySystem notifies pod-related data
   182  func (m *MetricsNotifierManagerImpl) notifyPods() {
   183  	now := time.Now()
   184  	m.RLock()
   185  	defer m.RUnlock()
   186  
   187  	for _, reg := range m.registeredNotifier[types.MetricsScopeContainer] {
   188  		v, err := m.metricStore.GetContainerMetric(reg.Req.PodUID, reg.Req.ContainerName, reg.Req.MetricName)
   189  		if err != nil {
   190  			continue
   191  		} else if v.Time == nil {
   192  			v.Time = &now
   193  		}
   194  
   195  		if reg.LastNotify.Equal(*v.Time) {
   196  			continue
   197  		} else {
   198  			reg.LastNotify = *v.Time
   199  		}
   200  
   201  		reg.Response <- types.NotifiedResponse{
   202  			Req:        reg.Req,
   203  			MetricData: v,
   204  		}
   205  	}
   206  
   207  	for _, reg := range m.registeredNotifier[types.MetricsScopeContainerNUMA] {
   208  		if reg.Req.NumaNode == "" {
   209  			continue
   210  		}
   211  
   212  		v, err := m.metricStore.GetContainerNumaMetric(reg.Req.PodUID, reg.Req.ContainerName, fmt.Sprintf("%v", reg.Req.NumaNode), reg.Req.MetricName)
   213  		if err != nil {
   214  			continue
   215  		} else if v.Time == nil {
   216  			v.Time = &now
   217  		}
   218  
   219  		if reg.LastNotify.Equal(*v.Time) {
   220  			continue
   221  		} else {
   222  			reg.LastNotify = *v.Time
   223  		}
   224  
   225  		reg.Response <- types.NotifiedResponse{
   226  			Req:        reg.Req,
   227  			MetricData: v,
   228  		}
   229  	}
   230  }
   231  
   232  type ExternalMetricManagerImpl struct {
   233  	*syntax.RWMutex
   234  	metricStore      *utilmetric.MetricStore
   235  	registeredMetric []func(store *utilmetric.MetricStore)
   236  }
   237  
   238  func NewExternalMetricManager(metricStore *utilmetric.MetricStore, emitter metrics.MetricEmitter) types.ExternalMetricManager {
   239  	return &ExternalMetricManagerImpl{
   240  		metricStore: metricStore,
   241  		RWMutex:     syntax.NewRWMutex(emitter),
   242  	}
   243  }
   244  
   245  func (m *ExternalMetricManagerImpl) RegisterExternalMetric(f func(store *utilmetric.MetricStore)) {
   246  	m.Lock()
   247  	defer m.Unlock()
   248  	m.registeredMetric = append(m.registeredMetric, f)
   249  }
   250  
   251  func (m *ExternalMetricManagerImpl) Sample() {
   252  	m.RLock()
   253  	defer m.RUnlock()
   254  	for _, f := range m.registeredMetric {
   255  		f(m.metricStore)
   256  	}
   257  }
   258  
   259  type MetricsFetcherImpl struct {
   260  	startOnce sync.Once
   261  	hasSynced bool
   262  
   263  	metricStore            *utilmetric.MetricStore
   264  	metricsNotifierManager types.MetricsNotifierManager
   265  	externalMetricManager  types.ExternalMetricManager
   266  	checkMetricDataExpire  CheckMetricDataExpireFunc
   267  
   268  	defaultInterval time.Duration
   269  	provisioners    map[string]types.MetricsProvisioner
   270  	intervals       map[string]time.Duration
   271  }
   272  
   273  func NewMetricsFetcher(baseConf *global.BaseConfiguration, metricConf *metaserver.MetricConfiguration, emitter metrics.MetricEmitter, podFetcher pod.PodFetcher) types.MetricsFetcher {
   274  	metricStore := utilmetric.NewMetricStore()
   275  	metricsNotifierManager := NewMetricsNotifierManager(metricStore, emitter)
   276  	externalMetricManager := NewExternalMetricManager(metricStore, emitter)
   277  
   278  	intervals := make(map[string]time.Duration)
   279  	provisioners := make(map[string]types.MetricsProvisioner)
   280  	registeredProvisioners := getProvisioners()
   281  	for _, name := range metricConf.MetricProvisions {
   282  		if f, ok := registeredProvisioners[name]; ok {
   283  			intervals[name] = metricConf.DefaultInterval
   284  			if interval, exist := metricConf.ProvisionerIntervals[name]; exist {
   285  				intervals[name] = interval
   286  			}
   287  			provisioners[name] = f(baseConf, metricConf, emitter, podFetcher, metricStore)
   288  		}
   289  	}
   290  
   291  	return &MetricsFetcherImpl{
   292  		metricStore:            metricStore,
   293  		metricsNotifierManager: metricsNotifierManager,
   294  		externalMetricManager:  externalMetricManager,
   295  		checkMetricDataExpire:  checkMetricDataExpireFunc(metricConf.MetricInsurancePeriod),
   296  
   297  		defaultInterval: metricConf.DefaultInterval,
   298  		provisioners:    provisioners,
   299  		intervals:       intervals,
   300  	}
   301  }
   302  
   303  func (f *MetricsFetcherImpl) GetNodeMetric(metricName string) (utilmetric.MetricData, error) {
   304  	return f.checkMetricDataExpire(f.metricStore.GetNodeMetric(metricName))
   305  }
   306  
   307  func (f *MetricsFetcherImpl) GetNumaMetric(numaID int, metricName string) (utilmetric.MetricData, error) {
   308  	return f.checkMetricDataExpire(f.metricStore.GetNumaMetric(numaID, metricName))
   309  }
   310  
   311  func (f *MetricsFetcherImpl) GetDeviceMetric(deviceName string, metricName string) (utilmetric.MetricData, error) {
   312  	return f.checkMetricDataExpire(f.metricStore.GetDeviceMetric(deviceName, metricName))
   313  }
   314  
   315  func (f *MetricsFetcherImpl) GetCPUMetric(coreID int, metricName string) (utilmetric.MetricData, error) {
   316  	return f.checkMetricDataExpire(f.metricStore.GetCPUMetric(coreID, metricName))
   317  }
   318  
   319  func (f *MetricsFetcherImpl) GetContainerMetric(podUID, containerName, metricName string) (utilmetric.MetricData, error) {
   320  	return f.checkMetricDataExpire(f.metricStore.GetContainerMetric(podUID, containerName, metricName))
   321  }
   322  
   323  func (f *MetricsFetcherImpl) GetContainerNumaMetric(podUID, containerName, numaNode, metricName string) (utilmetric.MetricData, error) {
   324  	return f.checkMetricDataExpire(f.metricStore.GetContainerNumaMetric(podUID, containerName, numaNode, metricName))
   325  }
   326  
   327  func (f *MetricsFetcherImpl) GetPodVolumeMetric(podUID, volumeName, metricName string) (utilmetric.MetricData, error) {
   328  	return f.checkMetricDataExpire(f.metricStore.GetPodVolumeMetric(podUID, volumeName, metricName))
   329  }
   330  
   331  func (f *MetricsFetcherImpl) GetCgroupMetric(cgroupPath, metricName string) (utilmetric.MetricData, error) {
   332  	return f.checkMetricDataExpire(f.metricStore.GetCgroupMetric(cgroupPath, metricName))
   333  }
   334  
   335  func (f *MetricsFetcherImpl) GetCgroupNumaMetric(cgroupPath string, numaNode int, metricName string) (utilmetric.MetricData, error) {
   336  	return f.checkMetricDataExpire(f.metricStore.GetCgroupNumaMetric(cgroupPath, numaNode, metricName))
   337  }
   338  
   339  func (f *MetricsFetcherImpl) AggregatePodNumaMetric(podList []*v1.Pod, numaNode, metricName string,
   340  	agg utilmetric.Aggregator, filter utilmetric.ContainerMetricFilter,
   341  ) utilmetric.MetricData {
   342  	return f.metricStore.AggregatePodNumaMetric(podList, numaNode, metricName, agg, filter)
   343  }
   344  
   345  func (f *MetricsFetcherImpl) AggregatePodMetric(podList []*v1.Pod, metricName string,
   346  	agg utilmetric.Aggregator, filter utilmetric.ContainerMetricFilter,
   347  ) utilmetric.MetricData {
   348  	return f.metricStore.AggregatePodMetric(podList, metricName, agg, filter)
   349  }
   350  
   351  func (f *MetricsFetcherImpl) AggregateCoreMetric(cpuset machine.CPUSet, metricName string, agg utilmetric.Aggregator) utilmetric.MetricData {
   352  	return f.metricStore.AggregateCoreMetric(cpuset, metricName, agg)
   353  }
   354  
   355  func (f *MetricsFetcherImpl) RegisterNotifier(scope types.MetricsScope, req types.NotifiedRequest, response chan types.NotifiedResponse) string {
   356  	return f.metricsNotifierManager.RegisterNotifier(scope, req, response)
   357  }
   358  
   359  func (f *MetricsFetcherImpl) DeRegisterNotifier(scope types.MetricsScope, key string) {
   360  	f.metricsNotifierManager.DeRegisterNotifier(scope, key)
   361  }
   362  
   363  func (f *MetricsFetcherImpl) RegisterExternalMetric(externalMetricFunc func(store *utilmetric.MetricStore)) {
   364  	f.externalMetricManager.RegisterExternalMetric(externalMetricFunc)
   365  }
   366  
   367  func (f *MetricsFetcherImpl) Run(ctx context.Context) {
   368  	// make sure all provisioners have started at least once,
   369  	// and then allow each provisioner to collect metrics with
   370  	// its specified period.
   371  	// whenever any provisioner finishes its collecting process,
   372  	// notification will be triggered, and the consumer should
   373  	// handler duplication logic if necessary.
   374  	f.startOnce.Do(func() {
   375  		f.init(ctx)
   376  		f.run(ctx)
   377  	})
   378  }
   379  
   380  func (f *MetricsFetcherImpl) init(ctx context.Context) {
   381  	wg := sync.WaitGroup{}
   382  	for name := range f.provisioners {
   383  		p := f.provisioners[name]
   384  		wg.Add(1)
   385  		go func() {
   386  			defer wg.Done()
   387  			p.Run(ctx)
   388  		}()
   389  	}
   390  	wg.Wait()
   391  
   392  	if f.externalMetricManager != nil {
   393  		f.externalMetricManager.Sample()
   394  	}
   395  
   396  	if f.metricsNotifierManager != nil {
   397  		f.metricsNotifierManager.Notify()
   398  	}
   399  
   400  	if !f.hasSynced {
   401  		f.hasSynced = true
   402  	}
   403  }
   404  
   405  func (f *MetricsFetcherImpl) run(ctx context.Context) {
   406  	// provisioner's implementation and its interval always exist,
   407  	// and it's ensured in init function
   408  	for name := range f.provisioners {
   409  		p := f.provisioners[name]
   410  		t := f.intervals[name]
   411  		go wait.Until(func() {
   412  			p.Run(ctx)
   413  			if f.metricsNotifierManager != nil {
   414  				f.metricsNotifierManager.Notify()
   415  			}
   416  		}, t, ctx.Done())
   417  	}
   418  
   419  	if f.externalMetricManager != nil {
   420  		go wait.Until(func() {
   421  			f.externalMetricManager.Sample()
   422  			if f.metricsNotifierManager != nil {
   423  				f.metricsNotifierManager.Notify()
   424  			}
   425  		}, f.defaultInterval, ctx.Done())
   426  	}
   427  }
   428  
   429  func (f *MetricsFetcherImpl) HasSynced() bool {
   430  	return f.hasSynced
   431  }