k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/cm/dra/plugin/noderesources.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	v1 "k8s.io/api/core/v1"
	resourceapi "k8s.io/api/resource/v1alpha2"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
	"k8s.io/utils/ptr"
)

const (
	// resyncPeriod for informer
	// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
	resyncPeriod = 10 * time.Minute
)

// nodeResourcesController collects resource information from all registered
// plugins and synchronizes that information with ResourceSlice objects.
type nodeResourcesController struct {
	ctx        context.Context
	kubeClient kubernetes.Interface
	getNode    func() (*v1.Node, error)
	wg         sync.WaitGroup
	queue      workqueue.TypedRateLimitingInterface[string]
	sliceStore cache.Store

	mutex         sync.RWMutex
	activePlugins map[string]*activePlugin
}

// activePlugin holds the resource information about one plugin
// and the gRPC stream that is used to retrieve that information. The context
// used by that stream can be canceled separately to stop
// the monitoring.
type activePlugin struct {
	// cancel is the function which cancels the monitorPlugin goroutine
	// for this plugin.
	cancel func(reason error)

	// resources is protected by the nodeResourcesController read/write lock.
	// When receiving updates from the driver, the entire slice gets replaced,
	// so it is okay to not do a deep copy of it. Only retrieving the slice
	// must be protected by a read lock.
	resources []*resourceapi.ResourceModel
}

// startNodeResourcesController constructs a new controller and starts it.
//
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins. Without it,
// the controller is inactive. This can happen when kubelet is run stand-alone
// without an apiserver. In that case we can't and don't need to publish
// ResourceSlices.
func startNodeResourcesController(ctx context.Context, kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *nodeResourcesController {
	if kubeClient == nil {
		return nil
	}

	logger := klog.FromContext(ctx)
	logger = klog.LoggerWithName(logger, "node resources controller")
	ctx = klog.NewContext(ctx, logger)

	c := &nodeResourcesController{
		ctx:        ctx,
		kubeClient: kubeClient,
		getNode:    getNode,
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: "node_resource_slices"},
		),
		activePlugins: make(map[string]*activePlugin),
	}

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		c.run(ctx)
	}()

	return c
}
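
// A minimal usage sketch (the caller shown here is hypothetical and not part
// of this file): the plugin registration code would construct the controller
// once and then notify it as plugins come and go:
//
//	controller := startNodeResourcesController(ctx, kubeClient, getNode)
//	...
//	controller.addPlugin(driverName, pluginInstance)   // on (re-)registration
//	...
//	controller.removePlugin(driverName)                // on deregistration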

// waitForStop blocks until all background activity spawned by
// the controller has stopped. The context passed to start must
// be canceled for that to happen.
//
// Not needed at the moment, but if it were, this is what it would
// look like...
// func (c *nodeResourcesController) waitForStop() {
// 	if c == nil {
// 		return
// 	}
//
// 	c.wg.Wait()
// }

// addPlugin is called whenever a plugin has been (re-)registered.
func (c *nodeResourcesController) addPlugin(driverName string, pluginInstance *plugin) {
	if c == nil {
		return
	}

	klog.FromContext(c.ctx).V(2).Info("Adding plugin", "driverName", driverName)
	c.mutex.Lock()
	defer c.mutex.Unlock()

	if active := c.activePlugins[driverName]; active != nil {
		active.cancel(errors.New("plugin has re-registered"))
	}
	active := &activePlugin{}
	cancelCtx, cancel := context.WithCancelCause(c.ctx)
	active.cancel = cancel
	c.activePlugins[driverName] = active
	c.queue.Add(driverName)

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		c.monitorPlugin(cancelCtx, active, driverName, pluginInstance)
	}()
}

// removePlugin is called whenever a plugin has been unregistered.
func (c *nodeResourcesController) removePlugin(driverName string) {
	if c == nil {
		return
	}

	klog.FromContext(c.ctx).V(2).Info("Removing plugin", "driverName", driverName)
	c.mutex.Lock()
	defer c.mutex.Unlock()
	if active, ok := c.activePlugins[driverName]; ok {
		active.cancel(errors.New("plugin has unregistered"))
		delete(c.activePlugins, driverName)
		c.queue.Add(driverName)
	}
}

// monitorPlugin calls the plugin to retrieve resource information and caches
// all responses that it gets for processing in the sync method. It keeps
// retrying until an error or an EOF response indicates that no further data is
// going to be sent; after that, watching the plugin's resources stops until it
// re-registers.
func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *activePlugin, driverName string, pluginInstance *plugin) {
	logger := klog.FromContext(ctx)
	logger = klog.LoggerWithValues(logger, "driverName", driverName)
	logger.Info("Starting to monitor node resources of the plugin")
	defer func() {
		r := recover()
		logger.Info("Stopping to monitor node resources of the plugin", "reason", context.Cause(ctx), "err", ctx.Err(), "recover", r)
	}()

	// Keep trying until canceled.
	for ctx.Err() == nil {
		logger.V(5).Info("Calling NodeListAndWatchResources")
		stream, err := pluginInstance.NodeListAndWatchResources(ctx, new(drapb.NodeListAndWatchResourcesRequest))
		if err != nil {
			switch {
			case status.Convert(err).Code() == codes.Unimplemented:
				// The plugin simply doesn't provide node resources.
				active.cancel(errors.New("plugin does not support node resource reporting"))
			default:
				// This is a problem, report it and retry.
				logger.Error(err, "Creating gRPC stream for node resources failed")
				// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
				select {
				case <-time.After(5 * time.Second):
				case <-ctx.Done():
				}
			}
			continue
		}
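		// The stream is established. Receive resource updates until the
		// stream ends or the context gets canceled.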
		for {
			response, err := stream.Recv()
			if err != nil {
				switch {
				case errors.Is(err, io.EOF):
					// This is okay. Some plugins might never change their
					// resources after reporting them once.
					active.cancel(errors.New("plugin has closed the stream"))
				case status.Convert(err).Code() == codes.Unimplemented:
					// The plugin has the method but does not really implement it.
					active.cancel(errors.New("plugin does not support node resource reporting"))
				case ctx.Err() == nil:
					// This is a problem, report it and retry.
					logger.Error(err, "Reading node resources from gRPC stream failed")
					// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
					select {
					case <-time.After(5 * time.Second):
					case <-ctx.Done():
					}
				}
				break
			}

			if loggerV := logger.V(6); loggerV.Enabled() {
				loggerV.Info("Driver resources updated", "resources", response.Resources)
			} else {
				logger.V(5).Info("Driver resources updated", "numResources", len(response.Resources))
			}

			c.mutex.Lock()
			active.resources = response.Resources
			c.mutex.Unlock()
			c.queue.Add(driverName)
		}
	}
}

// run runs in the background. It handles blocking initialization (like
// syncing the informer) and then syncs the actual state with the desired state.
func (c *nodeResourcesController) run(ctx context.Context) {
	logger := klog.FromContext(ctx)

	// When kubelet starts, we have two choices:
	// - Sync immediately, which in practice will delete all ResourceSlices
	//   because no plugin has registered yet. We could do a DeleteCollection
	//   to speed this up.
	// - Wait a bit, then sync. If all plugins have re-registered in the meantime,
	//   we might not need to change any ResourceSlice.
	//
	// For now syncing starts immediately, with no DeleteCollection. This
	// can be reconsidered later.

	// Wait until we're able to get a Node object.
	// This means that the object is created on the API server,
	// the kubeClient is functional and the node informer cache is populated with the node object.
	// Without this it doesn't make sense to proceed further as we need a node name and
	// a node UID for this controller to work.
	var node *v1.Node
	var err error
	for {
		node, err = c.getNode()
		if err == nil {
			break
		}
		logger.V(5).Info("Getting Node object failed, waiting", "err", err)
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Second):
		}
	}

	// We could use an indexer on the driver name, but that seems like overkill.
	informer := resourceinformers.NewFilteredResourceSliceInformer(c.kubeClient, resyncPeriod, nil, func(options *metav1.ListOptions) {
		options.FieldSelector = "nodeName=" + node.Name
	})
	c.sliceStore = informer.GetStore()
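	// Any add, update, or delete of a ResourceSlice for this node re-queues
	// the corresponding driver so that sync can reconcile the change.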
	handler, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj any) {
			slice, ok := obj.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			logger.V(5).Info("ResourceSlice add", "slice", klog.KObj(slice))
			c.queue.Add(slice.DriverName)
		},
		UpdateFunc: func(old, new any) {
			oldSlice, ok := old.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			newSlice, ok := new.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			if loggerV := logger.V(6); loggerV.Enabled() {
				loggerV.Info("ResourceSlice update", "slice", klog.KObj(newSlice), "diff", cmp.Diff(oldSlice, newSlice))
			} else {
				logger.V(5).Info("ResourceSlice update", "slice", klog.KObj(newSlice))
			}
			c.queue.Add(newSlice.DriverName)
		},
		DeleteFunc: func(obj any) {
			if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
				obj = tombstone.Obj
			}
			slice, ok := obj.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			logger.V(5).Info("ResourceSlice delete", "slice", klog.KObj(slice))
			c.queue.Add(slice.DriverName)
		},
	})
	if err != nil {
		logger.Error(err, "Registering event handler on the ResourceSlice informer failed, disabling resource monitoring")
		return
	}

	// Start informer and wait for our cache to be populated.
	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		informer.Run(ctx.Done())
	}()
	for !handler.HasSynced() {
		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return
		}
	}
	logger.Info("ResourceSlice informer has synced")

	for c.processNextWorkItem(ctx) {
	}
}

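// processNextWorkItem takes one driver name off the work queue and syncs the
// ResourceSlices for that driver. Panics during the sync are converted into
// errors and trigger a rate-limited retry. It returns false only after the
// queue has been shut down.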
func (c *nodeResourcesController) processNextWorkItem(ctx context.Context) bool {
	key, shutdown := c.queue.Get()
	if shutdown {
		return false
	}
	defer c.queue.Done(key)

	driverName := key

	// Panics are caught and treated like errors.
	var err error
	func() {
		defer func() {
			if r := recover(); r != nil {
				err = fmt.Errorf("internal error: %v", r)
			}
		}()
		err = c.sync(ctx, driverName)
	}()

	if err != nil {
		// TODO (https://github.com/kubernetes/enhancements/issues/3077): contextual logging in utilruntime
		utilruntime.HandleError(fmt.Errorf("processing driver %v: %v", driverName, err))
		c.queue.AddRateLimited(key)

		// Return without forgetting the work item. It has been re-added
		// with rate limiting and will be retried.
		return true
	}

	c.queue.Forget(key)
	return true
}

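// sync reconciles the published ResourceSlices of one driver with the
// resource models most recently reported by that driver: slices that already
// match a reported model are kept, obsolete slices are updated in place or
// deleted, and models without a matching slice are published as new slices.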
func (c *nodeResourcesController) sync(ctx context.Context, driverName string) error {
	logger := klog.FromContext(ctx)

	// Gather information about the actual and desired state.
	slices := c.sliceStore.List()
	var driverResources []*resourceapi.ResourceModel
	c.mutex.RLock()
	if active, ok := c.activePlugins[driverName]; ok {
		// No need for a deep copy, the entire slice gets replaced on writes.
		driverResources = active.resources
	}
	c.mutex.RUnlock()

	// Resources that are not yet stored in any slice need to be published.
	// Here we track the indices of any resources that are already stored.
	storedResourceIndices := sets.New[int]()

	// Slices that don't match any driver resource can either be updated (if there
	// are new driver resources that need to be stored) or deleted.
	obsoleteSlices := make([]*resourceapi.ResourceSlice, 0, len(slices))

	// Match slices with resource information.
	for _, obj := range slices {
		slice := obj.(*resourceapi.ResourceSlice)
		if slice.DriverName != driverName {
			continue
		}

		index := indexOfModel(driverResources, &slice.ResourceModel)
		if index >= 0 {
			storedResourceIndices.Insert(index)
			continue
		}

		obsoleteSlices = append(obsoleteSlices, slice)
	}

	if loggerV := logger.V(6); loggerV.Enabled() {
		// Dump the entire resource information.
		loggerV.Info("Syncing existing driver node resource slices with driver resources", "slices", klog.KObjSlice(slices), "resources", driverResources)
	} else {
		logger.V(5).Info("Syncing existing driver node resource slices with driver resources", "slices", klog.KObjSlice(slices), "numResources", len(driverResources))
	}

	// Update stale slices before removing what's left.
	//
	// We don't really know which of these slices might have
	// been used for "the" driver resource because they don't
	// have a unique ID. In practice, a driver is most likely
	// to just give us one ResourceModel, in which case
	// this isn't a problem at all. If we have more than one,
	// then at least conceptually it currently doesn't matter
	// where we publish it.
	//
	// The long-term goal is to move the handling of
	// ResourceSlice objects into the driver, with kubelet
	// just acting as a REST proxy. The advantage of that will
	// be that kubelet won't need to support the same
	// resource API version as the driver and the control plane.
	// With that approach, the driver will be able to match
	// up objects more intelligently.
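	//
	// Example: if the driver reports three models, none of which are stored
	// yet, and there are two obsolete slices, then the two slices get
	// rewritten in place and one new slice gets created for the remaining
	// model.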
	numObsoleteSlices := len(obsoleteSlices)
	for index, resource := range driverResources {
		if storedResourceIndices.Has(index) {
			// No need to do anything, it is already stored exactly
			// like this in an existing slice.
			continue
		}

		if numObsoleteSlices > 0 {
			// Update one existing slice.
			slice := obsoleteSlices[numObsoleteSlices-1]
			numObsoleteSlices--
			slice = slice.DeepCopy()
			slice.ResourceModel = *resource
			logger.V(5).Info("Reusing existing node resource slice", "slice", klog.KObj(slice))
			if _, err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Update(ctx, slice, metav1.UpdateOptions{}); err != nil {
				return fmt.Errorf("update node resource slice: %w", err)
			}
			continue
		}

		// Although the node name and UID are unlikely to change,
		// we retrieve an updated Node object just to be on the safe side.
		// It's a cheap operation as it gets the object from the node informer cache.
		node, err := c.getNode()
		if err != nil {
			return fmt.Errorf("retrieve node object: %w", err)
		}

		// Create a new slice.
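		// The controller owner reference ties the slice to the Node object,
		// so the API server's garbage collector cleans up the slice if the
		// Node object itself gets deleted.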
		slice := &resourceapi.ResourceSlice{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: node.Name + "-" + driverName + "-",
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion: v1.SchemeGroupVersion.WithKind("Node").Version,
						Kind:       v1.SchemeGroupVersion.WithKind("Node").Kind,
						Name:       node.Name,
						UID:        node.UID,
						Controller: ptr.To(true),
					},
				},
			},
			NodeName:      node.Name,
			DriverName:    driverName,
			ResourceModel: *resource,
		}
		logger.V(5).Info("Creating new node resource slice", "slice", klog.KObj(slice))
		if _, err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Create(ctx, slice, metav1.CreateOptions{}); err != nil {
			return fmt.Errorf("create node resource slice: %w", err)
		}
	}

	// All remaining slices are truly orphaned.
	for i := 0; i < numObsoleteSlices; i++ {
		slice := obsoleteSlices[i]
		logger.V(5).Info("Deleting obsolete node resource slice", "slice", klog.KObj(slice))
		if err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Delete(ctx, slice.Name, metav1.DeleteOptions{}); err != nil {
			return fmt.Errorf("delete node resource slice: %w", err)
		}
	}

	return nil
}

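// indexOfModel returns the index of the first entry in models that is
// semantically equal to model, or -1 if there is none.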
func indexOfModel(models []*resourceapi.ResourceModel, model *resourceapi.ResourceModel) int {
	for index, m := range models {
		if apiequality.Semantic.DeepEqual(m, model) {
			return index
		}
	}
	return -1
}