github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/spd/fetcher.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package spd
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"go.uber.org/atomic"
    26  	"k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/apimachinery/pkg/util/wait"
    29  	"k8s.io/client-go/tools/cache"
    30  	"k8s.io/klog/v2"
    31  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    32  
    33  	configapis "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1"
    34  	workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1"
    35  	"github.com/kubewharf/katalyst-core/pkg/client"
    36  	pkgconfig "github.com/kubewharf/katalyst-core/pkg/config"
    37  	"github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnc"
    38  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    39  	"github.com/kubewharf/katalyst-core/pkg/util"
    40  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    41  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    42  )
    43  
// Default arguments that NewSPDFetcher passes to NewSPDCache, in this order:
// defaultClearUnusedSPDPeriod, defaultMaxRetryCount, defaultJitterFactor.
// NOTE(review): judging by the names these are the period for clearing unused
// spd entries, the retry limit and the jitter factor applied by the cache —
// the cache implementation is not visible here, confirm against NewSPDCache.
const (
	defaultClearUnusedSPDPeriod = 12 * time.Hour
	defaultMaxRetryCount        = 3
	defaultJitterFactor         = 1
)
    49  
// Names of the count metrics emitted by the spd fetcher; every one of them is
// stored with the "spdNamespace" and "spdName" tags.
//   - metricsNameGetCNCTargetConfigFailed: sync could not get the target config from cnc.
//   - metricsNameUpdateCacheFailed: sync got an error from updateSPDCacheIfNeed.
//   - metricsNameCacheNotFound: getSPDByNamespaceName found no spd in the cache.
//   - metricsNameUpdateCacheSuccess: updateSPDCacheIfNeed fetched the spd from remote.
//   - metricsNameDeleteCache: the remote spd was not found and the cache entry is being deleted.
const (
	metricsNameGetCNCTargetConfigFailed = "spd_manager_get_cnc_target_failed"
	metricsNameUpdateCacheFailed        = "spd_manager_update_cache_failed"
	metricsNameCacheNotFound            = "spd_manager_cache_not_found"
	metricsNameUpdateCacheSuccess       = "spd_manager_update_cache_success"
	metricsNameDeleteCache              = "spd_manager_delete_cache"
)
    57  
// GetPodSPDNameFunc maps the metadata of a pod to the name of the spd that
// describes it. spdFetcher.GetSPD treats a non-nil error as "no spd for this
// pod" and answers with a NotFound error.
type GetPodSPDNameFunc func(_ metav1.ObjectMeta) (string, error)
    59  
// SPDFetcher gives access to the ServiceProfileDescriptor (spd) of a pod.
type SPDFetcher interface {
	// GetSPD returns the spd for the pod described by podMeta, or an error
	// when no spd can be provided for it.
	GetSPD(ctx context.Context, podMeta metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error)

	// Run starts the background work of the fetcher. The spdFetcher in this
	// file launches the cache loop and a periodic sync and then blocks until
	// ctx is done, while DummySPDFetcher returns immediately.
	Run(ctx context.Context)
}
    67  
// DummySPDFetcher is an SPDFetcher that serves one fixed spd and does no
// background work.
type DummySPDFetcher struct {
	// SPD is the descriptor returned by every GetSPD call; it may be nil.
	SPD *workloadapis.ServiceProfileDescriptor
}
    71  
// GetSPD ignores both the context and the pod metadata and returns the
// preset d.SPD together with a nil error.
func (d DummySPDFetcher) GetSPD(_ context.Context, _ metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error) {
	return d.SPD, nil
}
    75  
    76  func (d DummySPDFetcher) Run(_ context.Context) {
    77  	return
    78  }
    79  
// spdFetcher is the SPDFetcher implementation built by NewSPDFetcher. GetSPD
// is answered from spdCache only; the periodic sync started by Run refreshes
// the cached entries from the api-server, using the cnc target config hash to
// decide whether a refresh is needed.
//
// started is flipped to true by Run and makes SetGetPodSPDNameFunc refuse
// later changes. client is used to get spd objects from the api-server,
// emitter to store metrics, cncFetcher to read the current cnc, and
// getPodSPDNameFunc to resolve a pod to its spd name. checkpointManager is
// handed to NewSPDCache at construction time.
// NOTE(review): mux is not used by any method visible in this section —
// confirm whether it is still needed.
type spdFetcher struct {
	started *atomic.Bool
	mux     sync.Mutex

	client            *client.GenericClientSet
	emitter           metrics.MetricEmitter
	cncFetcher        cnc.CNCFetcher
	checkpointManager checkpointmanager.CheckpointManager
	getPodSPDNameFunc GetPodSPDNameFunc

	// spdCache is a cache of namespace/name to current target spd
	spdCache *Cache
}
    93  
    94  // NewSPDFetcher creates a spd manager to implement SPDFetcher
    95  func NewSPDFetcher(clientSet *client.GenericClientSet, emitter metrics.MetricEmitter,
    96  	cncFetcher cnc.CNCFetcher, conf *pkgconfig.Configuration,
    97  ) (SPDFetcher, error) {
    98  	checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir)
    99  	if err != nil {
   100  		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
   101  	}
   102  
   103  	m := &spdFetcher{
   104  		started:           atomic.NewBool(false),
   105  		client:            clientSet,
   106  		emitter:           emitter,
   107  		checkpointManager: checkpointManager,
   108  		cncFetcher:        cncFetcher,
   109  	}
   110  
   111  	m.getPodSPDNameFunc = util.GetPodSPDName
   112  	m.spdCache, err = NewSPDCache(checkpointManager, conf.ServiceProfileSkipCorruptionError, conf.ServiceProfileCacheTTL,
   113  		defaultClearUnusedSPDPeriod, defaultMaxRetryCount, defaultJitterFactor)
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  
   118  	return m, nil
   119  }
   120  
   121  func (s *spdFetcher) GetSPD(ctx context.Context, podMeta metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error) {
   122  	spdName, err := s.getPodSPDNameFunc(podMeta)
   123  	if err != nil {
   124  		general.Warningf("get spd for pod (%v/%v) err %v", podMeta.Namespace, podMeta.Name, err)
   125  		return nil, errors.NewNotFound(workloadapis.Resource(workloadapis.ResourceNameServiceProfileDescriptors), fmt.Sprintf("for pod(%v/%v)", podMeta.Namespace, podMeta.Name))
   126  	}
   127  
   128  	return s.getSPDByNamespaceName(ctx, podMeta.GetNamespace(), spdName)
   129  }
   130  
   131  // SetGetPodSPDNameFunc set get spd name function to override default getPodSPDNameFunc before started
   132  func (s *spdFetcher) SetGetPodSPDNameFunc(f GetPodSPDNameFunc) {
   133  	if s.started.Load() {
   134  		klog.Warningf("spd manager has already started, not allowed to set implementations")
   135  		return
   136  	}
   137  
   138  	s.getPodSPDNameFunc = f
   139  }
   140  
   141  func (s *spdFetcher) Run(ctx context.Context) {
   142  	if s.started.Swap(true) {
   143  		return
   144  	}
   145  
   146  	go s.spdCache.Run(ctx)
   147  	go wait.UntilWithContext(ctx, s.sync, 30*time.Second)
   148  	<-ctx.Done()
   149  }
   150  
   151  func (s *spdFetcher) getSPDByNamespaceName(_ context.Context, namespace, name string) (*workloadapis.ServiceProfileDescriptor, error) {
   152  	key := native.GenerateNamespaceNameKey(namespace, name)
   153  	baseTag := []metrics.MetricTag{
   154  		{Key: "spdNamespace", Val: namespace},
   155  		{Key: "spdName", Val: name},
   156  	}
   157  
   158  	// get current spd from cache
   159  	currentSPD := s.spdCache.GetSPD(key, true)
   160  	if currentSPD != nil {
   161  		return currentSPD, nil
   162  	}
   163  
   164  	_ = s.emitter.StoreInt64(metricsNameCacheNotFound, 1, metrics.MetricTypeNameCount, baseTag...)
   165  
   166  	return nil, errors.NewNotFound(workloadapis.Resource(workloadapis.ResourceNameServiceProfileDescriptors), name)
   167  }
   168  
   169  // getSPDTargetConfig get spd target config from cnc
   170  func (s *spdFetcher) getSPDTargetConfig(ctx context.Context, namespace, name string) (*configapis.TargetConfig, error) {
   171  	currentCNC, err := s.cncFetcher.GetCNC(ctx)
   172  	if err != nil {
   173  		return &configapis.TargetConfig{}, err
   174  	}
   175  
   176  	for _, target := range currentCNC.Status.ServiceProfileConfigList {
   177  		if target.ConfigNamespace == namespace && target.ConfigName == name {
   178  			return &target, nil
   179  		}
   180  	}
   181  
   182  	return nil, fmt.Errorf("get target spd %s/%s not found", namespace, name)
   183  }
   184  
   185  func (s *spdFetcher) sync(ctx context.Context) {
   186  	spdKeys := s.spdCache.ListAllSPDKeys()
   187  	for _, key := range spdKeys {
   188  		namespace, name, err := cache.SplitMetaNamespaceKey(key)
   189  		if err != nil {
   190  			continue
   191  		}
   192  
   193  		baseTag := []metrics.MetricTag{
   194  			{Key: "spdNamespace", Val: namespace},
   195  			{Key: "spdName", Val: name},
   196  		}
   197  
   198  		// first get spd origin spd from local cache
   199  		originSPD := s.spdCache.GetSPD(key, false)
   200  
   201  		// get spd current target config from cnc to limit rate of get remote spd by comparing local spd
   202  		// hash with cnc target config hash, if cnc target config not found it will get remote spd directly
   203  		targetConfig, err := s.getSPDTargetConfig(ctx, namespace, name)
   204  		if err != nil {
   205  			klog.Warningf("[spd-manager] get spd targetConfig config failed: %v, use local cache instead", err)
   206  			targetConfig = &configapis.TargetConfig{
   207  				ConfigNamespace: namespace,
   208  				ConfigName:      name,
   209  			}
   210  			_ = s.emitter.StoreInt64(metricsNameGetCNCTargetConfigFailed, 1, metrics.MetricTypeNameCount, baseTag...)
   211  		}
   212  
   213  		// try to update spd cache from remote if cache spd hash is not equal to target config hash,
   214  		// the rate of getting remote spd will be limited by spd ServiceProfileCacheTTL
   215  		err = s.updateSPDCacheIfNeed(ctx, originSPD, targetConfig)
   216  		if err != nil {
   217  			klog.Errorf("[spd-manager] failed update spd cache from remote: %v, use local cache instead", err)
   218  			_ = s.emitter.StoreInt64(metricsNameUpdateCacheFailed, 1, metrics.MetricTypeNameCount, baseTag...)
   219  		}
   220  	}
   221  }
   222  
   223  // updateSPDCacheIfNeed checks if the previous spd has changed, and
   224  // re-get from APIServer if the previous is out-of date.
   225  func (s *spdFetcher) updateSPDCacheIfNeed(ctx context.Context, originSPD *workloadapis.ServiceProfileDescriptor,
   226  	targetConfig *configapis.TargetConfig,
   227  ) error {
   228  	if originSPD == nil && targetConfig == nil {
   229  		return nil
   230  	}
   231  
   232  	now := time.Now()
   233  	if originSPD == nil || util.GetSPDHash(originSPD) != targetConfig.Hash {
   234  		key := native.GenerateNamespaceNameKey(targetConfig.ConfigNamespace, targetConfig.ConfigName)
   235  		// Skip the backoff delay if the configuration hash of the CNC target changes, ensuring the
   236  		// local SPD cache is always updated with the latest configuration.
   237  		if nextFetchRemoteTime := s.spdCache.GetNextFetchRemoteTime(key, now, targetConfig.Hash != ""); nextFetchRemoteTime.After(time.Now()) {
   238  			return nil
   239  		} else {
   240  			// first update the timestamp of the last attempt to fetch the remote spd to
   241  			// avoid frequent requests to the api-server in some bad situations
   242  			s.spdCache.SetLastFetchRemoteTime(key, now)
   243  		}
   244  
   245  		baseTag := []metrics.MetricTag{
   246  			{Key: "spdNamespace", Val: targetConfig.ConfigNamespace},
   247  			{Key: "spdName", Val: targetConfig.ConfigName},
   248  		}
   249  
   250  		klog.Infof("[spd-manager] spd %s targetConfig hash is changed from %s to %s", key, util.GetSPDHash(originSPD), targetConfig.Hash)
   251  		spd, err := s.client.InternalClient.WorkloadV1alpha1().ServiceProfileDescriptors(targetConfig.ConfigNamespace).
   252  			Get(ctx, targetConfig.ConfigName, metav1.GetOptions{ResourceVersion: "0"})
   253  		if err != nil && !errors.IsNotFound(err) {
   254  			return fmt.Errorf("get spd %s from remote failed: %v", key, err)
   255  		} else if err != nil {
   256  			_ = s.emitter.StoreInt64(metricsNameDeleteCache, 1, metrics.MetricTypeNameCount, baseTag...)
   257  			err = s.spdCache.DeleteSPD(key)
   258  			if err != nil {
   259  				return fmt.Errorf("delete spd %s from cache failed: %v", key, err)
   260  			}
   261  
   262  			klog.Infof("[spd-manager] spd %s cache has been deleted", key)
   263  			return nil
   264  		}
   265  
   266  		_ = s.emitter.StoreInt64(metricsNameUpdateCacheSuccess, 1, metrics.MetricTypeNameCount, baseTag...)
   267  
   268  		err = s.spdCache.SetSPD(key, spd)
   269  		if err != nil {
   270  			return err
   271  		}
   272  		klog.Infof("[spd-manager] spd %s cache has been updated to %v", key, spd)
   273  	}
   274  
   275  	return nil
   276  }