github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/manager.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package fetcher is a framework to collect resources from multiple plugins
    18  // (both in-tree and out-of-tree implementations) and push contents to reporter
    19  // manager to assemble and update through APIServer.
    20  package fetcher // import "github.com/kubewharf/katalyst-core/pkg/reportermanager/fetcher"
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"sync"
    26  	"time"
    27  
    28  	"google.golang.org/grpc/metadata"
    29  	"google.golang.org/grpc/status"
    30  	"k8s.io/apimachinery/pkg/util/errors"
    31  	"k8s.io/apimachinery/pkg/util/sets"
    32  	"k8s.io/apimachinery/pkg/util/wait"
    33  	"k8s.io/klog/v2"
    34  	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
    35  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    36  	cpmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
    37  
    38  	"github.com/kubewharf/katalyst-api/pkg/plugins/registration"
    39  	"github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1"
    40  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/checkpoint"
    41  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/kubelet"
    42  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/plugin"
    43  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/system"
    44  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/reporter"
    45  	"github.com/kubewharf/katalyst-core/pkg/config"
    46  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    47  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    48  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    49  )
    50  
    51  const reporterManagerCheckpoint = "reporter_manager_checkpoint"
    52  
    53  const (
    54  	metricsNameGetContentCost       = "reporter_get_content_cost"
    55  	metricsNameGetContentPluginCost = "reporter_get_content_plugin_cost"
    56  	metricsNameGenericSyncCost      = "reporter_generic_sync_cost"
    57  )
    58  
// ReporterPluginManager manages in-tree and out-of-tree reporter plugin
// registrations, collects report content from the registered endpoints and
// hands the collected content to the reporter manager.
type ReporterPluginManager struct {
	// callback is handed to the remote endpoints created in RegisterPlugin;
	// the constructor sets it to genericCallback.
	callback plugin.ListAndWatchCallback

	// mutex guards endpoints, which maps a plugin name to its corresponding
	// endpoint implementation. innerEndpoints holds the names of the in-tree
	// plugins that were registered successfully; writeCheckpoint consults it
	// to keep their responses out of the checkpoint.
	mutex          sync.Mutex
	innerEndpoints sets.String
	endpoints      map[string]plugin.Endpoint

	// checkpointManager stores and restores the responses of remote plugins.
	checkpointManager checkpointmanager.CheckpointManager

	// reporter receives the collected contents (see pushContents); emitter is
	// used for every metric emitted by this manager.
	reporter reporter.Manager
	emitter  metrics.MetricEmitter

	// reconcilePeriod is the duration between calls to syncFunc, which Run
	// schedules periodically; the constructor sets syncFunc to genericSync.
	reconcilePeriod time.Duration
	syncFunc        func(ctx context.Context)

	// healthzState records last time that the corresponding module is determined as healthy.
	// NOTE(review): no code visible in this file reads or writes this field.
	healthzState sync.Map
}
    82  
// innerReporterPluginsDisabledByDefault is the (currently empty) set of
// in-tree reporter plugin names that registerInnerReporterPlugins passes to
// general.IsNameEnabled together with the configured InnerPlugins list.
// NOTE(review): presumably names in this set stay off unless explicitly
// enabled by configuration — confirm against general.IsNameEnabled.
var innerReporterPluginsDisabledByDefault = sets.NewString()
    84  
    85  // NewReporterPluginManager creates a new reporter plugin manager.
    86  func NewReporterPluginManager(reporterMgr reporter.Manager, emitter metrics.MetricEmitter,
    87  	metaServer *metaserver.MetaServer, conf *config.Configuration,
    88  ) (*ReporterPluginManager, error) {
    89  	manager := &ReporterPluginManager{
    90  		innerEndpoints:  sets.NewString(),
    91  		endpoints:       make(map[string]plugin.Endpoint),
    92  		reporter:        reporterMgr,
    93  		emitter:         emitter,
    94  		reconcilePeriod: conf.CollectInterval,
    95  	}
    96  
    97  	manager.syncFunc = manager.genericSync
    98  	manager.callback = manager.genericCallback
    99  
   100  	checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir)
   101  	if err != nil {
   102  		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
   103  	}
   104  	manager.checkpointManager = checkpointManager
   105  
   106  	// load remote endpoints report response information from disk.
   107  	err = manager.readCheckpoint()
   108  	if err != nil {
   109  		_ = emitter.StoreInt64("reporter_plugin_checkpoint_read_failed", 1, metrics.MetricTypeNameCount)
   110  		klog.Warningf("continue after failing to read checkpoint file. response info from reporter plugin may NOT be up-to-date. Err: %v", err)
   111  	}
   112  
   113  	// register inner reporter plugins
   114  	err = manager.registerInnerReporterPlugins(emitter, metaServer, conf, manager.genericCallback, newReporterPluginInitializers())
   115  	if err != nil {
   116  		return nil, fmt.Errorf("get inner reporter plugin failed: %s", err)
   117  	}
   118  
   119  	return manager, nil
   120  }
   121  
   122  // newReporterPluginInitializers adds in-tree reporter plugins into init function list
   123  func newReporterPluginInitializers() map[string]plugin.InitFunc {
   124  	innerReporterPluginInitializers := make(map[string]plugin.InitFunc)
   125  	innerReporterPluginInitializers[system.PluginName] = system.NewSystemReporterPlugin
   126  	innerReporterPluginInitializers[kubelet.PluginName] = kubelet.NewKubeletReporterPlugin
   127  	return innerReporterPluginInitializers
   128  }
   129  
   130  func (m *ReporterPluginManager) registerInnerReporterPlugins(emitter metrics.MetricEmitter,
   131  	metaServer *metaserver.MetaServer, conf *config.Configuration, callback plugin.ListAndWatchCallback,
   132  	innerReporterPluginInitializers map[string]plugin.InitFunc,
   133  ) error {
   134  	var errList []error
   135  
   136  	for pluginName, initFn := range innerReporterPluginInitializers {
   137  		if !general.IsNameEnabled(pluginName, innerReporterPluginsDisabledByDefault, conf.GenericReporterConfiguration.InnerPlugins) {
   138  			klog.Infof("reporter plugin %s is disabled", pluginName)
   139  			continue
   140  		}
   141  
   142  		curPlugin, err := initFn(emitter, metaServer, conf, callback)
   143  		if err != nil {
   144  			errList = append(errList, err)
   145  			continue
   146  		}
   147  
   148  		err = m.registerPlugin(pluginName, curPlugin)
   149  		if err != nil {
   150  			errList = append(errList, err)
   151  			continue
   152  		}
   153  
   154  		m.innerEndpoints.Insert(pluginName)
   155  	}
   156  
   157  	if len(errList) > 0 {
   158  		return errors.NewAggregate(errList)
   159  	}
   160  
   161  	return nil
   162  }
   163  
// GetHandlerType returns the plugin type handled by this manager, which is
// registration.ReporterPlugin.
func (m *ReporterPluginManager) GetHandlerType() string {
	return registration.ReporterPlugin
}
   168  
   169  // ValidatePlugin is to validate the plugin info is supported
   170  func (m *ReporterPluginManager) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
   171  	klog.Infof("[reporter manager] get Plugin %s at Endpoint %s with versions %v", pluginName, endpoint, versions)
   172  
   173  	if !m.isVersionCompatibleWithPlugin(versions) {
   174  		return fmt.Errorf("reporter manager version, %s, is not among plugin supported versions %v", pluginapi.Version, versions)
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  // RegisterPlugin is to handle plugin register event
   181  func (m *ReporterPluginManager) RegisterPlugin(pluginName, endpoint string, _ []string) error {
   182  	klog.Infof("[reporter manager] registering Plugin %s at Endpoint %s", pluginName, endpoint)
   183  
   184  	var cache *v1alpha1.GetReportContentResponse
   185  	// if the plugin is already registered, use the old cache to avoid data loss
   186  	// when the plugin is re-registered.
   187  	m.mutex.Lock()
   188  	old, ok := m.endpoints[pluginName]
   189  	m.mutex.Unlock()
   190  	if ok {
   191  		cache = old.GetCache()
   192  	}
   193  
   194  	e, err := plugin.NewRemoteEndpoint(endpoint, pluginName, cache, m.emitter, m.callback)
   195  	if err != nil {
   196  		return fmt.Errorf("failed to dial device plugin with socketPath %s: %v", endpoint, err)
   197  	}
   198  
   199  	return m.registerPlugin(pluginName, e)
   200  }
   201  
   202  // DeRegisterPlugin is to handler plugin de-register event
   203  func (m *ReporterPluginManager) DeRegisterPlugin(pluginName string) {
   204  	m.mutex.Lock()
   205  	defer m.mutex.Unlock()
   206  
   207  	if e, ok := m.endpoints[pluginName]; ok {
   208  		e.Stop()
   209  		klog.Errorf("[reporter manager] reporter plugin %s has been deregistered", pluginName)
   210  		_ = m.emitter.StoreInt64("reporter_plugin_deregister", 1, metrics.MetricTypeNameCount,
   211  			metrics.ConvertMapToTags(map[string]string{
   212  				"plugin": pluginName,
   213  			})...)
   214  	}
   215  }
   216  
   217  // Run start the reporter plugin manager
   218  func (m *ReporterPluginManager) Run(ctx context.Context) {
   219  	go wait.UntilWithContext(ctx, m.syncFunc, m.reconcilePeriod)
   220  
   221  	klog.Infof("reporter plugin manager started")
   222  	m.reporter.Run(ctx)
   223  }
   224  
   225  func (m *ReporterPluginManager) isVersionCompatibleWithPlugin(versions []string) bool {
   226  	// todo: currently this is fine as we only have a single supported version. When we do need to support
   227  	// 	multiple versions in the future, we may need to extend this function to return a supported version.
   228  	// 	E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
   229  	// 	this function should return v1beta1
   230  	for _, version := range versions {
   231  		for _, supportedVersion := range v1alpha1.SupportedVersions {
   232  			if version == supportedVersion {
   233  				return true
   234  			}
   235  		}
   236  	}
   237  
   238  	return false
   239  }
   240  
   241  func (m *ReporterPluginManager) registerPlugin(pluginName string, e plugin.Endpoint) error {
   242  	m.registerEndpoint(pluginName, e)
   243  
   244  	success := make(chan bool)
   245  
   246  	go m.runEndpoint(pluginName, e, success)
   247  
   248  	select {
   249  	case pass := <-success:
   250  		if pass {
   251  			klog.Infof("plugin %s run success", pluginName)
   252  			return nil
   253  		}
   254  		return fmt.Errorf("failed to register plugin %s", pluginName)
   255  	}
   256  }
   257  
   258  func (m *ReporterPluginManager) registerEndpoint(pluginName string, e plugin.Endpoint) {
   259  	m.mutex.Lock()
   260  	defer m.mutex.Unlock()
   261  
   262  	old, ok := m.endpoints[pluginName]
   263  
   264  	if ok && !old.IsStopped() {
   265  		klog.Infof("stop old endpoint: %s", pluginName)
   266  		old.Stop()
   267  	}
   268  
   269  	m.endpoints[pluginName] = e
   270  	klog.Infof("registered plugin name %s", pluginName)
   271  }
   272  
   273  func (m *ReporterPluginManager) runEndpoint(pluginName string, e plugin.Endpoint, success chan<- bool) {
   274  	e.Run(success)
   275  	e.Stop()
   276  
   277  	_ = m.emitter.StoreInt64("reporter_plugin_unhealthy", 1, metrics.MetricTypeNameCount,
   278  		metrics.ConvertMapToTags(map[string]string{
   279  			"plugin": pluginName,
   280  		})...)
   281  	klog.Infof("reporter plugin %s became unhealthy", pluginName)
   282  }
   283  
   284  // genericCallback is triggered by ListAndWatch of plugin implementations;
   285  // the ListWatch function will store report content in Endpoint and send to manager,
   286  // and the manager can read it from Endpoint cache to obtain content changes initiative
   287  func (m *ReporterPluginManager) genericCallback(pluginName string, _ *v1alpha1.GetReportContentResponse) {
   288  	klog.Infof("genericCallback")
   289  	// get report content from each healthy Endpoint from cache, the last response
   290  	// from this plugin has been already stored to its Endpoint cache before this callback called
   291  	reportResponses, _ := m.getReportContent(true)
   292  
   293  	err := m.pushContents(context.Background(), reportResponses)
   294  	if err != nil {
   295  		_ = m.emitter.StoreInt64("reporter_plugin_lw_push_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
   296  			{Key: "plugin", Val: pluginName},
   297  		}...)
   298  		klog.Errorf("report plugin %s in callback failed with error: %v", pluginName, err)
   299  	}
   300  }
   301  
   302  func (m *ReporterPluginManager) pushContents(ctx context.Context, reportResponses map[string]*v1alpha1.GetReportContentResponse) error {
   303  	if err := m.writeCheckpoint(reportResponses); err != nil {
   304  		klog.Errorf("writing checkpoint encountered %v", err)
   305  	}
   306  
   307  	return m.reporter.PushContents(ctx, reportResponses)
   308  }
   309  
   310  // genericSync periodically calls the Get function to obtain content changes
   311  func (m *ReporterPluginManager) genericSync(ctx context.Context) {
   312  	klog.Infof("genericSync")
   313  
   314  	begin := time.Now()
   315  	defer func() {
   316  		costs := time.Since(begin)
   317  		klog.InfoS("finished genericSync", "costs", costs)
   318  		_ = m.emitter.StoreInt64(metricsNameGenericSyncCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
   319  	}()
   320  
   321  	// clear unhealthy plugin periodically
   322  	m.clearUnhealthyPlugin()
   323  
   324  	// get report content from each healthy Endpoint directly
   325  	reportResponses, _ := m.getReportContent(false)
   326  
   327  	pushErr := m.pushContents(ctx, reportResponses)
   328  	if pushErr != nil {
   329  		_ = m.emitter.StoreInt64("reporter_plugin_sync_push_failed", 1, metrics.MetricTypeNameCount)
   330  		klog.Errorf("report plugin failed with error: %v", pushErr)
   331  	}
   332  }
   333  
   334  // clearUnhealthyPlugin is to clear stopped plugins from cache which exceeded grace period
   335  func (m *ReporterPluginManager) clearUnhealthyPlugin() {
   336  	m.mutex.Lock()
   337  	defer m.mutex.Unlock()
   338  
   339  	for pluginName, e := range m.endpoints {
   340  		if e.StopGracePeriodExpired() {
   341  			delete(m.endpoints, pluginName)
   342  
   343  			klog.Warningf("plugin %s has been clear", pluginName)
   344  			_ = m.emitter.StoreInt64("reporter_plugin_clear", 1, metrics.MetricTypeNameCount,
   345  				metrics.ConvertMapToTags(map[string]string{
   346  					"plugin": pluginName,
   347  				})...)
   348  		}
   349  	}
   350  }
   351  
// getReportContent collects the report content of every registered endpoint
// and returns it keyed by plugin name, together with the aggregate of the
// errors met while calling the plugins.
//
// If cacheFirst is true, the cached response of an endpoint is used when it
// is not nil; otherwise (or when there is no cache) the plugin is called
// directly. When the direct call fails, the cached response is used instead,
// so the map may hold a nil entry for a plugin that has no cache yet.
//
// The manager mutex is held for the whole call, including the calls to the
// plugins. NOTE(review): both callers in this file discard the returned error.
func (m *ReporterPluginManager) getReportContent(cacheFirst bool) (map[string]*v1alpha1.GetReportContentResponse, error) {
	reportResponses := make(map[string]*v1alpha1.GetReportContentResponse)
	errList := make([]error, 0)

	begin := time.Now()
	m.mutex.Lock()
	defer func() {
		// release the lock first, then log and emit the overall cost.
		m.mutex.Unlock()
		costs := time.Since(begin)
		klog.InfoS("finished getReportContent cnr", "costs", costs)
		_ = m.emitter.StoreInt64(metricsNameGetContentCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
	}()

	// get report content from each Endpoint
	for pluginName, e := range m.endpoints {
		var (
			resp *v1alpha1.GetReportContentResponse
			err  error
		)

		// if cacheFirst is false or cache response is nil, we will try to get report content directly from plugin
		if cacheFirst {
			cache := e.GetCache()
			if cache != nil {
				reportResponses[pluginName] = cache
				continue
			}
		}

		// the call is made with a fresh background context carrying empty
		// outgoing metadata; its cost is logged and emitted per plugin.
		ctx := metadata.NewOutgoingContext(context.Background(), metadata.New(nil))
		epBegin := time.Now()
		resp, err = e.GetReportContent(ctx)
		epCosts := time.Since(epBegin)
		klog.InfoS("GetReportContent", "costs", epCosts, "pluginName", pluginName)
		_ = m.emitter.StoreInt64(metricsNameGetContentPluginCost, epCosts.Microseconds(), metrics.MetricTypeNameRaw, []metrics.MetricTag{{Key: "plugin", Val: pluginName}}...)
		if err != nil {
			errList = append(errList, err)
			// the status code taken from the error is used as a metric tag.
			s, _ := status.FromError(err)
			_ = m.emitter.StoreInt64("reporter_plugin_get_content_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
				{Key: "code", Val: s.Code().String()},
				{Key: "plugin", Val: pluginName},
			}...)

			klog.Errorf("GetReportContentResponse from %s Endpoint failed with error: %v", pluginName, err)
			// if it gets report content failed, uses cached response
			resp = e.GetCache()
		}

		reportResponses[pluginName] = resp
	}

	return reportResponses, errors.NewAggregate(errList)
}
   407  
   408  func (m *ReporterPluginManager) writeCheckpoint(reportResponses map[string]*v1alpha1.GetReportContentResponse) error {
   409  	remoteResponses := make(map[string]*v1alpha1.GetReportContentResponse, 0)
   410  	// only write remote endpoint response to checkpoint
   411  	for name, response := range reportResponses {
   412  		if m.innerEndpoints.Has(name) {
   413  			continue
   414  		}
   415  		remoteResponses[name] = response
   416  	}
   417  	data := checkpoint.New(remoteResponses)
   418  	err := m.checkpointManager.CreateCheckpoint(reporterManagerCheckpoint, data)
   419  	if err != nil {
   420  		_ = m.emitter.StoreInt64("reporter_plugin_checkpoint_write_failed", 1, metrics.MetricTypeNameCount)
   421  		return fmt.Errorf("failed to write checkpoint file %q: %v", reporterManagerCheckpoint, err)
   422  	}
   423  	return nil
   424  }
   425  
   426  func (m *ReporterPluginManager) readCheckpoint() error {
   427  	reportResponses := make(map[string]*v1alpha1.GetReportContentResponse, 0)
   428  	cp := checkpoint.New(reportResponses)
   429  	err := m.checkpointManager.GetCheckpoint(reporterManagerCheckpoint, cp)
   430  	if err != nil {
   431  		if err == cpmerrors.ErrCheckpointNotFound {
   432  			klog.Warningf("failed to retrieve checkpoint for %q: %v", reporterManagerCheckpoint, err)
   433  			return nil
   434  		}
   435  		return err
   436  	}
   437  	reportResponses = cp.GetData()
   438  	m.mutex.Lock()
   439  	defer m.mutex.Unlock()
   440  	for name, response := range reportResponses {
   441  		// During start up, creates stopped remote endpoint so that the report content
   442  		// will stay zero till the corresponding device plugin re-registers.
   443  		m.endpoints[name] = plugin.NewStoppedRemoteEndpoint(name, response)
   444  	}
   445  	return nil
   446  }