github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/kcc/manager.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kcc
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"sync"
    24  	"time"
    25  
    26  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    27  	"k8s.io/apimachinery/pkg/api/meta"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/runtime"
    30  	"k8s.io/apimachinery/pkg/runtime/schema"
    31  	"k8s.io/apimachinery/pkg/util/errors"
    32  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  	"k8s.io/klog/v2"
    35  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    36  
    37  	"github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1"
    38  	"github.com/kubewharf/katalyst-core/pkg/client"
    39  	pkgconfig "github.com/kubewharf/katalyst-core/pkg/config"
    40  	"github.com/kubewharf/katalyst-core/pkg/config/agent"
    41  	"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
    42  	"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/crd"
    43  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnc"
    44  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    45  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    46  	"github.com/kubewharf/katalyst-core/pkg/util/syntax"
    47  )
    48  
const (
	// updateConfigInterval is the period handed to wait.JitterUntilWithContext
	// in Run, i.e. the base interval between two dynamic config updates.
	updateConfigInterval     = 3 * time.Second
	// updateConfigJitterFactor is the jitter factor handed to
	// wait.JitterUntilWithContext in Run together with updateConfigInterval.
	updateConfigJitterFactor = 0.5
)
    53  
const (
	// metricsNameUpdateConfig is emitted once per update attempt with a
	// "status" tag of either "success" or "failed".
	metricsNameUpdateConfig   = "metaserver_update_config"
	// metricsNameLoadCheckpoint is emitted when the local checkpoint is
	// consulted; it carries a "status" tag (one of the values below) and a
	// "kind" tag naming the config kind involved.
	metricsNameLoadCheckpoint = "metaserver_load_checkpoint"

	// status: the checkpoint could not be read at all
	metricsValueStatusCheckpointNotFoundOrCorrupted = "notFoundOrCorrupted"
	// status: the checkpoint was read, but the entry for the kind is not a
	// non-nil pointer or is older than the configured grace time
	metricsValueStatusCheckpointInvalidOrExpired    = "invalidOrExpired"
	// status: the checkpoint entry was used in place of the remote config
	metricsValueStatusCheckpointSuccess             = "success"
)
    62  
const (
	// configManagerCheckpoint is the checkpoint key under which the config
	// manager stores and loads its data through the checkpoint manager.
	configManagerCheckpoint = "config_manager_checkpoint"
)
    66  
var (
	// katalystConfigGVRToGVKMap maps resources (both the guessed plural and
	// singular forms) of the katalyst config v1alpha1 scheme to their kinds;
	// it is built once at package initialization by getGVRToGVKMap.
	katalystConfigGVRToGVKMap = getGVRToGVKMap()

	// updateConfigBackoff is the retry policy used by InitializeConfig.
	// NOTE(review): per the wait.Backoff documentation, once the duration
	// reaches Cap the remaining steps are dropped, so fewer than Steps
	// attempts may be made with these values — confirm this is intended.
	updateConfigBackoff = wait.Backoff{
		Duration: 5 * time.Second,
		Factor:   2,
		Jitter:   0.1,
		Steps:    5,
		Cap:      15 * time.Second,
	}
)
    78  
// ConfigurationManager manages the dynamic configuration of the agent: callers
// register the resources they are interested in, trigger an initial load, and
// then run the periodic update loop.
type ConfigurationManager interface {
	// InitializeConfig triggers an initialization of the dynamic configuration
	// directly, and returns an error if the initialization failed.
	InitializeConfig(ctx context.Context) error
	// AddConfigWatcher adds the given gvrs to the list of resources that are
	// watched to get dynamic configuration.
	AddConfigWatcher(gvrs ...metav1.GroupVersionResource) error
	// Run starts the main loop.
	Run(ctx context.Context)
}
    88  
    89  type DummyConfigurationManager struct{}
    90  
    91  func (d *DummyConfigurationManager) InitializeConfig(_ context.Context) error {
    92  	return nil
    93  }
    94  
    95  func (d *DummyConfigurationManager) AddConfigWatcher(_ ...metav1.GroupVersionResource) error {
    96  	return nil
    97  }
    98  
    99  func (d *DummyConfigurationManager) Run(_ context.Context) {}
   100  
// compile-time check that DynamicConfigManager implements ConfigurationManager
var _ ConfigurationManager = &DynamicConfigManager{}

// DynamicConfigManager is to fetch dynamic config from remote
type DynamicConfigManager struct {
	// conf is the agent configuration whose dynamic part is replaced (through
	// SetDynamicConfiguration) whenever the dynamic config CRD changes.
	// defaultConfig is a deep copy of the dynamic configuration taken when the
	// manager is constructed; every update starts from a fresh copy of it and
	// applies the dynamic config CRD on top, so dynamic conf works in an
	// incremental way over the defaults.
	conf          *agent.AgentConfiguration
	defaultConfig *dynamic.Configuration

	// lastDynamicConfigCRD is used to record the last dynamic config CRD
	// to avoid unnecessary update
	lastDynamicConfigCRD *crd.DynamicConfigCRD

	// configLoader loads the config object of a given gvr.
	// emitter reports update and checkpoint metrics.
	configLoader ConfigurationLoader
	emitter      metrics.MetricEmitter

	// mux guards resourceGVRMap: AddConfigWatcher takes the write lock and
	// tryUpdateConfig holds the read lock for the duration of an update.
	// resourceGVRMap records the GVRs that should be watched, keyed by
	// resource name.
	mux            sync.RWMutex
	resourceGVRMap map[string]metav1.GroupVersionResource

	// checkpointManager stores the recent dynamic config locally.
	// checkpointGraceTime is how long after its timestamp a checkpointed
	// config may still be used when loading from remote fails.
	checkpointManager   checkpointmanager.CheckpointManager
	checkpointGraceTime time.Duration
}
   127  
   128  // NewDynamicConfigManager new a dynamic config manager use katalyst custom config sdk.
   129  func NewDynamicConfigManager(clientSet *client.GenericClientSet, emitter metrics.MetricEmitter,
   130  	cncFetcher cnc.CNCFetcher, conf *pkgconfig.Configuration,
   131  ) (ConfigurationManager, error) {
   132  	configLoader := NewKatalystCustomConfigLoader(clientSet, conf.ConfigCacheTTL, cncFetcher)
   133  
   134  	checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir)
   135  	if err != nil {
   136  		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
   137  	}
   138  
   139  	return &DynamicConfigManager{
   140  		conf:                conf.AgentConfiguration,
   141  		defaultConfig:       deepCopy(conf.GetDynamicConfiguration()),
   142  		configLoader:        configLoader,
   143  		emitter:             emitter,
   144  		resourceGVRMap:      make(map[string]metav1.GroupVersionResource),
   145  		checkpointManager:   checkpointManager,
   146  		checkpointGraceTime: conf.ConfigCheckpointGraceTime,
   147  	}, nil
   148  }
   149  
   150  // AddConfigWatcher add gvr to list which will be watched to get dynamic configuration
   151  func (c *DynamicConfigManager) AddConfigWatcher(gvrs ...metav1.GroupVersionResource) error {
   152  	c.mux.Lock()
   153  	defer c.mux.Unlock()
   154  
   155  	for _, gvr := range gvrs {
   156  		if oldGVR, ok := c.resourceGVRMap[gvr.Resource]; ok && gvr != oldGVR {
   157  			return fmt.Errorf("resource %s already reggistered by gvrs %s which is different with %s",
   158  				gvr.Resource, oldGVR.String(), gvr.String())
   159  		}
   160  
   161  		c.resourceGVRMap[gvr.Resource] = gvr
   162  	}
   163  
   164  	return nil
   165  }
   166  
   167  // Run is to start update config loops until the context is done
   168  func (c *DynamicConfigManager) Run(ctx context.Context) {
   169  	go wait.JitterUntilWithContext(ctx, func(context.Context) {
   170  		if err := c.tryUpdateConfig(ctx, true); err != nil {
   171  			klog.Errorf("try update config error: %v", err)
   172  		}
   173  	}, updateConfigInterval, updateConfigJitterFactor, true)
   174  	<-ctx.Done()
   175  }
   176  
   177  // InitializeConfig will try to initialize dynamic config
   178  func (c *DynamicConfigManager) InitializeConfig(ctx context.Context) error {
   179  	err := wait.ExponentialBackoff(updateConfigBackoff, func() (bool, error) {
   180  		err := c.tryUpdateConfig(ctx, false)
   181  		if err == nil {
   182  			return true, nil
   183  		}
   184  
   185  		if c.conf.ConfigSkipFailedInitialization {
   186  			klog.Warningf("unable to update dynamic config: %v, fallback to default config", err)
   187  			return true, nil
   188  		}
   189  
   190  		klog.Errorf("unable to update dynamic config: %v, back off to retry", err)
   191  		return false, nil
   192  	})
   193  
   194  	return err
   195  }
   196  
   197  func (c *DynamicConfigManager) tryUpdateConfig(ctx context.Context, skipError bool) error {
   198  	c.mux.RLock()
   199  	defer c.mux.RUnlock()
   200  
   201  	err := c.updateConfig(ctx)
   202  	if err != nil {
   203  		_ = c.emitter.StoreInt64(metricsNameUpdateConfig, 1, metrics.MetricTypeNameCount, metrics.MetricTag{
   204  			Key: "status", Val: "failed",
   205  		})
   206  
   207  		// return an error if skipError is false to make sure the config is correct at startup
   208  		if !skipError {
   209  			return err
   210  		}
   211  	} else {
   212  		_ = c.emitter.StoreInt64(metricsNameUpdateConfig, 1, metrics.MetricTypeNameCount, metrics.MetricTag{
   213  			Key: "status", Val: "success",
   214  		})
   215  	}
   216  
   217  	return nil
   218  }
   219  
   220  // updateConfig is used to get dynamic agent config from remote
   221  func (c *DynamicConfigManager) updateConfig(ctx context.Context) error {
   222  	dynamicConfigCRD, success, err := c.updateDynamicConfig(c.resourceGVRMap, katalystConfigGVRToGVKMap,
   223  		func(gvr metav1.GroupVersionResource, conf interface{}) error {
   224  			return c.configLoader.LoadConfig(ctx, gvr, conf)
   225  		},
   226  	)
   227  	if !success {
   228  		return err
   229  	} else if apiequality.Semantic.DeepEqual(c.lastDynamicConfigCRD, dynamicConfigCRD) {
   230  		klog.V(4).Infof("dynamic config is not changed")
   231  		return nil
   232  	}
   233  
   234  	klog.Infof("dynamic config crd is changed from %v to %v", c.lastDynamicConfigCRD, dynamicConfigCRD)
   235  	currentConfig := deepCopy(c.defaultConfig)
   236  	applyDynamicConfig(currentConfig, dynamicConfigCRD)
   237  	c.conf.SetDynamicConfiguration(currentConfig)
   238  	c.lastDynamicConfigCRD = dynamicConfigCRD
   239  	return err
   240  }
   241  
   242  func (c *DynamicConfigManager) writeCheckpoint(kind string, configData reflect.Value) {
   243  	// read checkpoint to get config data related to other gvr
   244  	data, err := c.readCheckpoint()
   245  	if err != nil {
   246  		klog.Errorf("load checkpoint from %q failed: %v, try to overwrite it", configManagerCheckpoint, err)
   247  		_ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
   248  			{Key: "status", Val: metricsValueStatusCheckpointNotFoundOrCorrupted},
   249  			{Key: "kind", Val: kind},
   250  		}...)
   251  	}
   252  
   253  	// checkpoint doesn't exist or became corrupted, make a new checkpoint
   254  	if data == nil {
   255  		data = NewCheckpoint(make(map[string]TargetConfigData))
   256  	}
   257  
   258  	// set config value and timestamp for kind
   259  	data.SetData(kind, configData, metav1.Now())
   260  	err = c.checkpointManager.CreateCheckpoint(configManagerCheckpoint, data)
   261  	if err != nil {
   262  		klog.Errorf("failed to write checkpoint file %q: %v", configManagerCheckpoint, err)
   263  	}
   264  }
   265  
   266  func (c *DynamicConfigManager) readCheckpoint() (ConfigManagerCheckpoint, error) {
   267  	configResponses := make(map[string]TargetConfigData)
   268  	cp := NewCheckpoint(configResponses)
   269  	err := c.checkpointManager.GetCheckpoint(configManagerCheckpoint, cp)
   270  	if err != nil {
   271  		return nil, err
   272  	}
   273  
   274  	return cp, nil
   275  }
   276  
   277  func (c *DynamicConfigManager) updateDynamicConfig(resourceGVRMap map[string]metav1.GroupVersionResource,
   278  	gvrToKind map[schema.GroupVersionResource]schema.GroupVersionKind,
   279  	loader func(gvr metav1.GroupVersionResource, conf interface{}) error,
   280  ) (*crd.DynamicConfigCRD, bool, error) {
   281  	dynamicConfiguration := &crd.DynamicConfigCRD{}
   282  	success := false
   283  
   284  	var errList []error
   285  	for _, gvr := range resourceGVRMap {
   286  		schemaGVR := native.ToSchemaGVR(gvr.Group, gvr.Version, gvr.Resource)
   287  		kind, ok := gvrToKind[schemaGVR]
   288  		if !ok {
   289  			errList = append(errList, fmt.Errorf("gvk of gvr %s is not found", gvr))
   290  			continue
   291  		}
   292  
   293  		// get target dynamic config configField by kind
   294  		configField := reflect.ValueOf(dynamicConfiguration).Elem().FieldByName(kind.Kind)
   295  
   296  		// create a new instance of this configField type
   297  		newConfigData := reflect.New(configField.Type().Elem())
   298  		err := loader(gvr, newConfigData.Interface())
   299  		if err != nil {
   300  			klog.Warningf("failed to load targetConfigMeta from targetConfigMeta fetcher: %s", err)
   301  			// get target dynamic configField value from checkpoint
   302  			data, err := c.readCheckpoint()
   303  			if err != nil {
   304  				_ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{
   305  					{Key: "status", Val: metricsValueStatusCheckpointNotFoundOrCorrupted},
   306  					{Key: "kind", Val: kind.Kind},
   307  				}...)
   308  				errList = append(errList, fmt.Errorf("failed to get targetConfigMeta from checkpoint"))
   309  				continue
   310  			} else {
   311  				configData, timestamp := data.GetData(kind.Kind)
   312  				if configData.Kind() == reflect.Ptr && !configData.IsNil() &&
   313  					time.Now().Before(timestamp.Add(c.checkpointGraceTime)) {
   314  					newConfigData = configData
   315  					klog.Infof("failed to load targetConfigMeta from remote, use local checkpoint instead")
   316  					_ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{
   317  						{Key: "status", Val: metricsValueStatusCheckpointSuccess},
   318  						{Key: "kind", Val: kind.Kind},
   319  					}...)
   320  				} else {
   321  					_ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{
   322  						{Key: "status", Val: metricsValueStatusCheckpointInvalidOrExpired},
   323  						{Key: "kind", Val: kind.Kind},
   324  					}...)
   325  					errList = append(errList, fmt.Errorf("checkpoint data for gvr %v is empty or out of date", gvr.String()))
   326  					continue
   327  				}
   328  			}
   329  		}
   330  
   331  		// set target dynamic configField by new config field
   332  		configField.Set(newConfigData)
   333  		success = true
   334  		c.writeCheckpoint(kind.Kind, newConfigData)
   335  	}
   336  
   337  	return dynamicConfiguration, success, errors.NewAggregate(errList)
   338  }
   339  
   340  func getGVRToGVKMap() map[schema.GroupVersionResource]schema.GroupVersionKind {
   341  	scheme := runtime.NewScheme()
   342  	utilruntime.Must(v1alpha1.AddToScheme(scheme))
   343  
   344  	knownTypes := scheme.AllKnownTypes()
   345  	gvrToKind := make(map[schema.GroupVersionResource]schema.GroupVersionKind)
   346  	for kind := range knownTypes {
   347  		plural, singular := meta.UnsafeGuessKindToResource(kind)
   348  		gvrToKind[plural] = kind
   349  		gvrToKind[singular] = kind
   350  	}
   351  	return gvrToKind
   352  }
   353  
   354  func applyDynamicConfig(config *dynamic.Configuration,
   355  	dynamicConfigCRD *crd.DynamicConfigCRD,
   356  ) {
   357  	config.ApplyConfiguration(dynamicConfigCRD)
   358  }
   359  
   360  func deepCopy(src *dynamic.Configuration) *dynamic.Configuration {
   361  	return syntax.DeepCopy(src).(*dynamic.Configuration)
   362  }