github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/reporter/cnr/cnrreporter.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cnr
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"sync"
    24  	"time"
    25  
    26  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    27  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/util/errors"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	"k8s.io/apimachinery/pkg/util/wait"
    32  	"k8s.io/klog/v2"
    33  
    34  	nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    35  	clientset "github.com/kubewharf/katalyst-api/pkg/client/clientset/versioned"
    36  	"github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1"
    37  	"github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/reporter"
    38  	"github.com/kubewharf/katalyst-core/pkg/client"
    39  	"github.com/kubewharf/katalyst-core/pkg/client/control"
    40  	"github.com/kubewharf/katalyst-core/pkg/config"
    41  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    42  	metaservercnr "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnr"
    43  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    44  	"github.com/kubewharf/katalyst-core/pkg/util"
    45  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    46  	"github.com/kubewharf/katalyst-core/pkg/util/syntax"
    47  )
    48  
    49  const (
    50  	cnrReporterName = "cnr-reporter"
    51  
    52  	// cnrUpdateMaxRetryTimes update cnr retry time.
    53  	cnrUpdateMaxRetryTimes = 3
    54  )
    55  
    56  const (
    57  	refreshLatestCNRJitterFactor = 0.5
    58  )
    59  
    60  const (
    61  	metricsNameRefreshCNRCost            = "refresh_cnr_cost"
    62  	metricsNameUpdateCNRCost             = "update_cnr_cost"
    63  	metricsNameUpdateCNRSpecMetadataCost = "update_cnr_spec_metadata_cost"
    64  	metricsNameUpdateCNRStatusCost       = "update_cnr_status_cost"
    65  )
    66  
    67  // cnrReporterImpl is to report cnr content to remote
    68  type cnrReporterImpl struct {
    69  	cnrName string
    70  
    71  	// defaultLabels contains the default config for CNR created by reporter
    72  	defaultLabels map[string]string
    73  	// latestUpdatedCNR is used as an in-memory cache for CNR;
    74  	// whenever CNR info is needed, get from this cache firstly
    75  	latestUpdatedCNR *nodev1alpha1.CustomNodeResource
    76  	mux              sync.Mutex
    77  
    78  	notifiers map[string]metaservercnr.CNRNotifier
    79  
    80  	client  clientset.Interface
    81  	updater control.CNRControl
    82  	emitter metrics.MetricEmitter
    83  
    84  	mergeValueFunc syntax.MergeValueFunc
    85  
    86  	refreshLatestCNRPeriod time.Duration
    87  }
    88  
    89  // NewCNRReporter create a cnr reporter
    90  func NewCNRReporter(genericClient *client.GenericClientSet, metaServer *metaserver.MetaServer,
    91  	emitter metrics.MetricEmitter, conf *config.Configuration,
    92  ) (reporter.Reporter, error) {
    93  	c := &cnrReporterImpl{
    94  		cnrName:                conf.NodeName,
    95  		refreshLatestCNRPeriod: conf.RefreshLatestCNRPeriod,
    96  		defaultLabels:          conf.DefaultCNRLabels,
    97  		notifiers:              make(map[string]metaservercnr.CNRNotifier),
    98  		emitter:                emitter,
    99  		client:                 genericClient.InternalClient,
   100  		updater:                control.NewCNRControlImpl(genericClient.InternalClient),
   101  	}
   102  	// register itself as a resource reporter in meta-server
   103  	metaServer.SetCNRFetcher(c)
   104  
   105  	c.mergeValueFunc = syntax.SimpleMergeTwoValues
   106  	return c, nil
   107  }
   108  
   109  // Run start cnr reporter
   110  func (c *cnrReporterImpl) Run(ctx context.Context) {
   111  	go wait.JitterUntilWithContext(ctx, c.refreshLatestCNR, c.refreshLatestCNRPeriod, refreshLatestCNRJitterFactor, true)
   112  	<-ctx.Done()
   113  }
   114  
   115  // GetCNR tries to return local cache if exists, otherwise get from APIServer
   116  
   117  func (c *cnrReporterImpl) GetCNR(ctx context.Context) (*nodev1alpha1.CustomNodeResource, error) {
   118  	cnr := c.latestUpdatedCNR.DeepCopy()
   119  	if cnr != nil {
   120  		return cnr, nil
   121  	}
   122  
   123  	return c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"})
   124  }
   125  
   126  // Update is to update remote cnr according to reported fields
   127  func (c *cnrReporterImpl) Update(ctx context.Context, fields []*v1alpha1.ReportField) error {
   128  	beginWithLock := time.Now()
   129  	c.mux.Lock()
   130  	beginWithoutLock := time.Now()
   131  
   132  	defer func() {
   133  		costs := time.Since(beginWithoutLock)
   134  		klog.InfoS("finished update cnr without lock", "costs", costs)
   135  
   136  		c.mux.Unlock()
   137  
   138  		costs = time.Since(beginWithLock)
   139  		klog.InfoS("finished update cnr with lock", "costs", costs)
   140  		_ = c.emitter.StoreInt64(metricsNameUpdateCNRCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
   141  	}()
   142  
   143  	if klog.V(4).Enabled() {
   144  		for _, f := range fields {
   145  			klog.Infof("field name %s/%s with value %s", f.FieldType, f.FieldName, string(f.Value))
   146  		}
   147  	}
   148  
   149  	for i := 0; i < cnrUpdateMaxRetryTimes; i++ {
   150  		if err := c.tryUpdateCNR(ctx, fields, i); err != nil {
   151  			klog.Errorf("error updating cnr, will retry: %v", err)
   152  		} else {
   153  			return nil
   154  		}
   155  	}
   156  
   157  	return fmt.Errorf("attempt to update cnr failed with total retries of %d", cnrUpdateMaxRetryTimes)
   158  }
   159  
   160  // RegisterNotifier register a notifier to cnr reporter
   161  func (c *cnrReporterImpl) RegisterNotifier(name string, notifier metaservercnr.CNRNotifier) error {
   162  	c.mux.Lock()
   163  	defer c.mux.Unlock()
   164  
   165  	if _, ok := c.notifiers[name]; ok {
   166  		return fmt.Errorf("notifier %s already exists", name)
   167  	}
   168  
   169  	c.notifiers[name] = notifier
   170  	return nil
   171  }
   172  
   173  // UnregisterNotifier unregister a notifier from cnr reporter
   174  func (c *cnrReporterImpl) UnregisterNotifier(name string) error {
   175  	c.mux.Lock()
   176  	defer c.mux.Unlock()
   177  
   178  	if _, ok := c.notifiers[name]; !ok {
   179  		return fmt.Errorf("notifier %s not exists", name)
   180  	}
   181  
   182  	delete(c.notifiers, name)
   183  	return nil
   184  }
   185  
   186  // refreshLatestCNR get latest cnr from remote, because cnr in cache may not have been updated.
   187  func (c *cnrReporterImpl) refreshLatestCNR(ctx context.Context) {
   188  	c.mux.Lock()
   189  	defer c.mux.Unlock()
   190  
   191  	begin := time.Now()
   192  	defer func() {
   193  		costs := time.Since(begin)
   194  		klog.Infof("finished refresh cnr (%v)", costs)
   195  		_ = c.emitter.StoreInt64(metricsNameRefreshCNRCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
   196  	}()
   197  
   198  	cnr, err := c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"})
   199  	if err == nil {
   200  		c.latestUpdatedCNR = cnr.DeepCopy()
   201  	} else if !c.resetCNRIfNeeded(err) {
   202  		klog.Errorf("refresh local cnr cache failed with error: %v", err)
   203  	}
   204  }
   205  
   206  // tryUpdateCNR update cnr according reported fields, first update cnr try will use cached latestUpdatedCNR,
   207  // if there are some errors such as conflict happened, it will retry by getting cnr from api server
   208  func (c *cnrReporterImpl) tryUpdateCNR(ctx context.Context, fields []*v1alpha1.ReportField, tryIdx int) error {
   209  	var (
   210  		cnr *nodev1alpha1.CustomNodeResource
   211  		err error
   212  	)
   213  
   214  	// only get cnr from api server iff latest updated cnr is nil or tryIdx > 0
   215  	if c.latestUpdatedCNR == nil || tryIdx > 0 {
   216  		c.countMetricsWithBaseTags("reporter_update_retry")
   217  
   218  		cnr, err = c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"})
   219  		if err != nil && !apierrors.IsNotFound(err) {
   220  			c.countMetricsWithBaseTags("reporter_update_get_failed")
   221  			if c.resetCNRIfNeeded(err) {
   222  				return nil
   223  			}
   224  			return err
   225  		}
   226  
   227  		// NotFound to create cnr
   228  		if err != nil {
   229  			cnr, err = c.createCNR(ctx, fields)
   230  			if err != nil {
   231  				c.countMetricsWithBaseTags("reporter_update_failed")
   232  				return fmt.Errorf("create cnr failed: %s", err)
   233  			}
   234  		}
   235  
   236  		c.latestUpdatedCNR = cnr.DeepCopy()
   237  	} else {
   238  		cnr = c.latestUpdatedCNR.DeepCopy()
   239  	}
   240  
   241  	if cnr == nil {
   242  		return fmt.Errorf("nil %q cnr object", c.cnrName)
   243  	}
   244  
   245  	originCNR := cnr.DeepCopy()
   246  	err = setCNR(cnr, fields, c.mergeValueFunc)
   247  	if err != nil {
   248  		return err
   249  	}
   250  
   251  	// todo: consider whether we need to handle update error automatically
   252  	//  i.e. use queue to push and pop those failed items
   253  
   254  	// try patch spec and metadata first, because the update of cnr will change the ResourceVersion in ObjectMeta
   255  	originCNR, err = c.tryUpdateCNRSpecAndMetadata(ctx, originCNR, cnr)
   256  	if err != nil && !c.resetCNRIfNeeded(err) {
   257  		return err
   258  	} else if err != nil {
   259  		originCNR = c.latestUpdatedCNR.DeepCopy()
   260  	}
   261  
   262  	_, err = c.tryUpdateCNRStatus(ctx, originCNR, cnr)
   263  	if err != nil {
   264  		return err
   265  	}
   266  
   267  	return nil
   268  }
   269  
   270  func (c *cnrReporterImpl) tryUpdateCNRSpecAndMetadata(ctx context.Context,
   271  	originCNR, currentCNR *nodev1alpha1.CustomNodeResource,
   272  ) (*nodev1alpha1.CustomNodeResource, error) {
   273  	var (
   274  		cnr *nodev1alpha1.CustomNodeResource
   275  		err error
   276  	)
   277  
   278  	if cnrSpecHasChanged(&originCNR.Spec, &currentCNR.Spec) || cnrMetadataHasChanged(&originCNR.ObjectMeta, &currentCNR.ObjectMeta) {
   279  		klog.Infof("cnr spec or metadata changed, try to patch it")
   280  
   281  		begin := time.Now()
   282  		defer func() {
   283  			costs := time.Since(begin)
   284  			klog.Infof("finished update cnr spec and metadata (%v)", costs)
   285  			_ = c.emitter.StoreInt64(metricsNameUpdateCNRSpecMetadataCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
   286  		}()
   287  
   288  		// patch cnr spec and metadata
   289  		cnr, err = c.updater.PatchCNRSpecAndMetadata(ctx, c.cnrName, originCNR, currentCNR)
   290  		if err != nil {
   291  			c.countMetricsWithBaseTags("reporter_update",
   292  				metrics.ConvertMapToTags(map[string]string{
   293  					"field":  "spec",
   294  					"status": "failed",
   295  				})...)
   296  			return nil, err
   297  		}
   298  
   299  		c.countMetricsWithBaseTags("reporter_update",
   300  			metrics.ConvertMapToTags(map[string]string{
   301  				"field":  "spec",
   302  				"status": "success",
   303  			})...)
   304  
   305  		klog.Infof("patch cnr spec and metadata success\n old cnr spec: %#v, metadata: %#v,\n "+
   306  			"new cnr spec: %#v, metadata: %#v",
   307  			originCNR.Spec, originCNR.ObjectMeta, cnr.Spec, cnr.ObjectMeta)
   308  		c.latestUpdatedCNR = cnr.DeepCopy()
   309  
   310  		// notify cnr spec and metadata update
   311  		for _, notifier := range c.notifiers {
   312  			notifier.OnCNRUpdate(cnr)
   313  		}
   314  	} else {
   315  		return originCNR, nil
   316  	}
   317  
   318  	return cnr, nil
   319  }
   320  
   321  func (c *cnrReporterImpl) tryUpdateCNRStatus(ctx context.Context,
   322  	originCNR, currentCNR *nodev1alpha1.CustomNodeResource,
   323  ) (*nodev1alpha1.CustomNodeResource, error) {
   324  	var (
   325  		cnr *nodev1alpha1.CustomNodeResource
   326  		err error
   327  	)
   328  
   329  	if cnrStatusHasChanged(&originCNR.Status, &currentCNR.Status) {
   330  		klog.Infof("cnr status changed, try to patch it")
   331  
   332  		begin := time.Now()
   333  		defer func() {
   334  			costs := time.Since(begin)
   335  			klog.Infof("finished update cnr status (%v)", costs)
   336  			_ = c.emitter.StoreInt64(metricsNameUpdateCNRStatusCost, costs.Microseconds(), metrics.MetricTypeNameRaw)
   337  		}()
   338  
   339  		// patch cnr status
   340  		cnr, err = c.updater.PatchCNRStatus(ctx, c.cnrName, originCNR, currentCNR)
   341  		if err != nil {
   342  			c.countMetricsWithBaseTags("reporter_update",
   343  				metrics.ConvertMapToTags(map[string]string{
   344  					"field":  "status",
   345  					"status": "failed",
   346  				})...)
   347  			return nil, err
   348  		}
   349  
   350  		c.countMetricsWithBaseTags("reporter_update",
   351  			metrics.ConvertMapToTags(map[string]string{
   352  				"field":  "status",
   353  				"status": "success",
   354  			})...)
   355  
   356  		klog.Infof("patch cnr status success old status: %#v,\n new status: %#v", originCNR.Status, cnr.Status)
   357  		c.latestUpdatedCNR = cnr.DeepCopy()
   358  
   359  		// notify cnr status update
   360  		for _, notifier := range c.notifiers {
   361  			notifier.OnCNRStatusUpdate(cnr)
   362  		}
   363  	} else {
   364  		return originCNR, nil
   365  	}
   366  
   367  	return cnr, nil
   368  }
   369  
   370  // resetCNRIfNeeded reset cnr if unmarshal type error, it will initialize
   371  // local cnr cache to make sure the content of cnr always is true
   372  // todo if $ref is supported in CRD, we can skip this since api-server will help with validations
   373  func (c *cnrReporterImpl) resetCNRIfNeeded(err error) bool {
   374  	if general.IsUnmarshalTypeError(err) {
   375  		c.latestUpdatedCNR = c.defaultCNR()
   376  		klog.Infof("success re-initialize local cnr cache")
   377  		return true
   378  	}
   379  
   380  	return false
   381  }
   382  
   383  func (c *cnrReporterImpl) defaultCNR() *nodev1alpha1.CustomNodeResource {
   384  	return &nodev1alpha1.CustomNodeResource{
   385  		ObjectMeta: metav1.ObjectMeta{
   386  			Name:   c.cnrName,
   387  			Labels: c.defaultLabels,
   388  		},
   389  	}
   390  }
   391  
   392  func (c *cnrReporterImpl) createCNR(ctx context.Context, fields []*v1alpha1.ReportField) (*nodev1alpha1.CustomNodeResource, error) {
   393  	cnr := c.defaultCNR()
   394  
   395  	err := setCNR(cnr, fields, c.mergeValueFunc)
   396  	if err != nil {
   397  		return nil, fmt.Errorf("set cnr failed: %s", err)
   398  	}
   399  
   400  	klog.Infof("try to create cnr: %#v", cnr)
   401  
   402  	cnr, err = c.client.NodeV1alpha1().CustomNodeResources().Create(ctx, cnr, metav1.CreateOptions{})
   403  	if err != nil {
   404  		return cnr, err
   405  	}
   406  
   407  	return cnr, nil
   408  }
   409  
   410  func setCNR(cnr *nodev1alpha1.CustomNodeResource, fields []*v1alpha1.ReportField,
   411  	mergeFunc func(src reflect.Value, dst reflect.Value) error,
   412  ) error {
   413  	var errList []error
   414  	initializedFields := sets.String{}
   415  	for _, f := range fields {
   416  		if f == nil {
   417  			continue
   418  		}
   419  
   420  		// initialize need report cnr field first
   421  		if !initializedFields.Has(f.FieldName) {
   422  			err := initializeFieldToCNR(cnr, *f)
   423  			if err != nil {
   424  				errList = append(errList, err)
   425  				continue
   426  			}
   427  
   428  			initializedFields.Insert(f.FieldName)
   429  		}
   430  
   431  		// parse report field to cnr by merge function
   432  		_, err := parseReportFieldToCNR(cnr, *f, mergeFunc)
   433  		if err != nil {
   434  			errList = append(errList, err)
   435  			continue
   436  		}
   437  	}
   438  
   439  	if len(errList) > 0 {
   440  		return errors.NewAggregate(errList)
   441  	}
   442  
   443  	if err := reviseCNR(cnr); err != nil {
   444  		return err
   445  	}
   446  
   447  	return nil
   448  }
   449  
   450  // reviseCNR revises the field of cnr to make sure it is not redundant
   451  func reviseCNR(cnr *nodev1alpha1.CustomNodeResource) error {
   452  	if cnr == nil {
   453  		return nil
   454  	}
   455  
   456  	// merge all topology zones
   457  	cnr.Status.TopologyZone = util.MergeTopologyZone(nil, cnr.Status.TopologyZone)
   458  	return nil
   459  }
   460  
   461  func (c *cnrReporterImpl) countMetricsWithBaseTags(key string, tags ...metrics.MetricTag) {
   462  	tags = append(tags,
   463  		metrics.ConvertMapToTags(map[string]string{
   464  			"reporterName": cnrReporterName,
   465  		})...)
   466  
   467  	_ = c.emitter.StoreInt64(key, 1, metrics.MetricTypeNameCount, tags...)
   468  }
   469  
   470  // initializeFieldToCNR initialize cnr fields to nil
   471  func initializeFieldToCNR(cnr *nodev1alpha1.CustomNodeResource, field v1alpha1.ReportField) error {
   472  	// get need report value of cnr
   473  	originValue, err := getCNRField(cnr, field)
   474  	if err != nil {
   475  		return err
   476  	}
   477  
   478  	originValue.Set(reflect.New(originValue.Type()).Elem())
   479  	return nil
   480  }
   481  
   482  // parseReportFieldToCNR parse reportField and merge to origin cnr by mergeFunc
   483  func parseReportFieldToCNR(cnr *nodev1alpha1.CustomNodeResource, reportField v1alpha1.ReportField,
   484  	mergeFunc func(src reflect.Value, dst reflect.Value) error,
   485  ) (*nodev1alpha1.CustomNodeResource, error) {
   486  	if cnr == nil {
   487  		return nil, fmt.Errorf("cnr is nil")
   488  	}
   489  
   490  	// get need report value of cnr
   491  	originValue, err := getCNRField(cnr, reportField)
   492  	if err != nil {
   493  		return nil, err
   494  	}
   495  
   496  	// parse report value to base field type
   497  	reportValue, err := syntax.ParseBytesByType(reportField.Value, originValue.Type())
   498  	if err != nil || !reportValue.IsValid() {
   499  		return nil, fmt.Errorf("report %s with value %s is invald with err: %s", reportField.FieldName, string(reportField.Value), err)
   500  	}
   501  
   502  	err = mergeFunc(reportValue, originValue)
   503  	if err != nil {
   504  		return nil, err
   505  	}
   506  
   507  	return cnr, nil
   508  }
   509  
   510  // getCNRField only support to parse first-level fields in cnr now;
   511  // todo: support to parse nested fields in the future.
   512  func getCNRField(cnr *nodev1alpha1.CustomNodeResource, reportField v1alpha1.ReportField) (reflect.Value, error) {
   513  	var el reflect.Value
   514  	switch reportField.FieldType {
   515  	case v1alpha1.FieldType_Status:
   516  		el = reflect.ValueOf(&cnr.Status)
   517  	case v1alpha1.FieldType_Spec:
   518  		el = reflect.ValueOf(&cnr.Spec)
   519  	case v1alpha1.FieldType_Metadata:
   520  		el = reflect.ValueOf(cnr)
   521  	default:
   522  		return reflect.Value{}, fmt.Errorf("not support field type %s", reportField.FieldType)
   523  	}
   524  
   525  	if el.Kind() == reflect.Ptr {
   526  		el = el.Elem()
   527  	}
   528  
   529  	// find origin value by field name
   530  	field := el.FieldByName(reportField.FieldName)
   531  	if !field.IsValid() {
   532  		return reflect.Value{}, fmt.Errorf("field %s is invalid", reportField.FieldName)
   533  	}
   534  
   535  	return field, nil
   536  }
   537  
   538  func cnrMetadataHasChanged(originMeta *metav1.ObjectMeta, meta *metav1.ObjectMeta) bool {
   539  	return !apiequality.Semantic.DeepEqual(originMeta, meta)
   540  }
   541  
   542  func cnrSpecHasChanged(originSpec *nodev1alpha1.CustomNodeResourceSpec, spec *nodev1alpha1.CustomNodeResourceSpec) bool {
   543  	return !apiequality.Semantic.DeepEqual(originSpec, spec)
   544  }
   545  
   546  func cnrStatusHasChanged(originStatus *nodev1alpha1.CustomNodeResourceStatus, status *nodev1alpha1.CustomNodeResourceStatus) bool {
   547  	return !apiequality.Semantic.DeepEqual(originStatus, status)
   548  }