k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/cm/dra/plugin/noderesources.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	v1 "k8s.io/api/core/v1"
	resourceapi "k8s.io/api/resource/v1alpha2"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
	"k8s.io/utils/ptr"
)

const (
	// resyncPeriod for informer
	// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
	resyncPeriod = time.Duration(10 * time.Minute)
)

// nodeResourcesController collects resource information from all registered
// plugins and synchronizes that information with ResourceSlice objects.
type nodeResourcesController struct {
	ctx        context.Context
	kubeClient kubernetes.Interface
	getNode    func() (*v1.Node, error)
	wg         sync.WaitGroup
	queue      workqueue.TypedRateLimitingInterface[string]
	sliceStore cache.Store

	mutex         sync.RWMutex
	activePlugins map[string]*activePlugin
}

// activePlugin holds the resource information about one plugin
// and the gRPC stream that is used to retrieve that. The context
// used by that stream can be canceled separately to stop
// the monitoring.
type activePlugin struct {
	// cancel is the function which cancels the monitorPlugin goroutine
	// for this plugin.
	cancel func(reason error)

	// resources is protected by the nodeResourcesController read/write lock.
	// When receiving updates from the driver, the entire slice gets replaced,
	// so it is okay to not do a deep copy of it. Only retrieving the slice
	// must be protected by a read lock.
	resources []*resourceapi.ResourceModel
}

// startNodeResourcesController constructs a new controller and starts it.
//
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins. Without it,
// the controller is inactive. This can happen when kubelet is run stand-alone
// without an apiserver. In that case we can't and don't need to publish
// ResourceSlices.
func startNodeResourcesController(ctx context.Context, kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *nodeResourcesController {
	if kubeClient == nil {
		return nil
	}

	logger := klog.FromContext(ctx)
	logger = klog.LoggerWithName(logger, "node resources controller")
	ctx = klog.NewContext(ctx, logger)

	c := &nodeResourcesController{
		ctx:        ctx,
		kubeClient: kubeClient,
		getNode:    getNode,
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: "node_resource_slices"},
		),
		activePlugins: make(map[string]*activePlugin),
	}

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		c.run(ctx)
	}()

	return c
}

// waitForStop blocks until all background activity spawned by
// the controller has stopped. The context passed to start must
// be canceled for that to happen.
//
// Not needed at the moment, but if it was, this is what it would
// look like...
// func (c *nodeResourcesController) waitForStop() {
// 	if c == nil {
// 		return
// 	}
//
// 	c.wg.Wait()
// }

// addPlugin is called whenever a plugin has been (re-)registered.
func (c *nodeResourcesController) addPlugin(driverName string, pluginInstance *plugin) {
	if c == nil {
		return
	}

	klog.FromContext(c.ctx).V(2).Info("Adding plugin", "driverName", driverName)
	c.mutex.Lock()
	defer c.mutex.Unlock()

	if active := c.activePlugins[driverName]; active != nil {
		active.cancel(errors.New("plugin has re-registered"))
	}
	active := &activePlugin{}
	cancelCtx, cancel := context.WithCancelCause(c.ctx)
	active.cancel = cancel
	c.activePlugins[driverName] = active
	c.queue.Add(driverName)

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		c.monitorPlugin(cancelCtx, active, driverName, pluginInstance)
	}()
}

// removePlugin is called whenever a plugin has been unregistered.
func (c *nodeResourcesController) removePlugin(driverName string) {
	if c == nil {
		return
	}

	klog.FromContext(c.ctx).V(2).Info("Removing plugin", "driverName", driverName)
	c.mutex.Lock()
	defer c.mutex.Unlock()
	if active, ok := c.activePlugins[driverName]; ok {
		active.cancel(errors.New("plugin has unregistered"))
		delete(c.activePlugins, driverName)
		c.queue.Add(driverName)
	}
}

// monitorPlugin calls the plugin to retrieve resource information and caches
// all responses that it gets for processing in the sync method. It keeps
// retrying until an error or EOF response indicates that no further data is
// going to be sent; after that, monitoring of the plugin's resources stops
// until the plugin re-registers.
func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *activePlugin, driverName string, pluginInstance *plugin) {
	logger := klog.FromContext(ctx)
	logger = klog.LoggerWithValues(logger, "driverName", driverName)
	logger.Info("Starting to monitor node resources of the plugin")
	defer func() {
		r := recover()
		logger.Info("Stopping to monitor node resources of the plugin", "reason", context.Cause(ctx), "err", ctx.Err(), "recover", r)
	}()

	// Keep trying until canceled.
	for ctx.Err() == nil {
		logger.V(5).Info("Calling NodeListAndWatchResources")
		stream, err := pluginInstance.NodeListAndWatchResources(ctx, new(drapb.NodeListAndWatchResourcesRequest))
		if err != nil {
			switch {
			case status.Convert(err).Code() == codes.Unimplemented:
				// The plugin simply doesn't provide node resources.
				active.cancel(errors.New("plugin does not support node resource reporting"))
			default:
				// This is a problem, report it and retry.
				logger.Error(err, "Creating gRPC stream for node resources failed")
				// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
				select {
				case <-time.After(5 * time.Second):
				case <-ctx.Done():
				}
			}
			continue
		}
		for {
			response, err := stream.Recv()
			if err != nil {
				switch {
				case errors.Is(err, io.EOF):
					// This is okay. Some plugins might never change their
					// resources after reporting them once.
					active.cancel(errors.New("plugin has closed the stream"))
				case status.Convert(err).Code() == codes.Unimplemented:
					// The plugin has the method but does not really implement it.
					active.cancel(errors.New("plugin does not support node resource reporting"))
				case ctx.Err() == nil:
					// This is a problem, report it and retry.
					logger.Error(err, "Reading node resources from gRPC stream failed")
					// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
					select {
					case <-time.After(5 * time.Second):
					case <-ctx.Done():
					}
				}
				break
			}

			if loggerV := logger.V(6); loggerV.Enabled() {
				loggerV.Info("Driver resources updated", "resources", response.Resources)
			} else {
				logger.V(5).Info("Driver resources updated", "numResources", len(response.Resources))
			}

			c.mutex.Lock()
			active.resources = response.Resources
			c.mutex.Unlock()
			c.queue.Add(driverName)
		}
	}
}

// run runs in the background. It handles blocking initialization (like
// syncing the informer) and then syncs the actual state with the desired state.
func (c *nodeResourcesController) run(ctx context.Context) {
	logger := klog.FromContext(ctx)

	// When kubelet starts, we have two choices:
	// - Sync immediately, which in practice will delete all ResourceSlices
	//   because no plugin has registered yet. We could do a DeleteCollection
	//   to speed this up.
	// - Wait a bit, then sync. If all plugins have re-registered in the meantime,
	//   we might not need to change any ResourceSlice.
	//
	// For now syncing starts immediately, with no DeleteCollection. This
	// can be reconsidered later.

	// Wait until we're able to get a Node object.
	// This means that the object is created on the API server,
	// the kubeclient is functional and the node informer cache is populated with the node object.
	// Without this it doesn't make sense to proceed further as we need a node name and
	// a node UID for this controller to work.
	var node *v1.Node
	var err error
	for {
		node, err = c.getNode()
		if err == nil {
			break
		}
		logger.V(5).Info("Getting Node object failed, waiting", "err", err)
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Second):
		}
	}

	// We could use an indexer on driver name, but that seems overkill.
	informer := resourceinformers.NewFilteredResourceSliceInformer(c.kubeClient, resyncPeriod, nil, func(options *metav1.ListOptions) {
		options.FieldSelector = "nodeName=" + node.Name
	})
	c.sliceStore = informer.GetStore()
	handler, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj any) {
			slice, ok := obj.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			logger.V(5).Info("ResourceSlice add", "slice", klog.KObj(slice))
			c.queue.Add(slice.DriverName)
		},
		UpdateFunc: func(old, new any) {
			oldSlice, ok := old.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			newSlice, ok := new.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			if loggerV := logger.V(6); loggerV.Enabled() {
				loggerV.Info("ResourceSlice update", "slice", klog.KObj(newSlice), "diff", cmp.Diff(oldSlice, newSlice))
			} else {
				logger.V(5).Info("ResourceSlice update", "slice", klog.KObj(newSlice))
			}
			c.queue.Add(newSlice.DriverName)
		},
		DeleteFunc: func(obj any) {
			if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
				obj = tombstone.Obj
			}
			slice, ok := obj.(*resourceapi.ResourceSlice)
			if !ok {
				return
			}
			logger.V(5).Info("ResourceSlice delete", "slice", klog.KObj(slice))
			c.queue.Add(slice.DriverName)
		},
	})
	if err != nil {
		logger.Error(err, "Registering event handler on the ResourceSlice informer failed, disabling resource monitoring")
		return
	}

	// Start informer and wait for our cache to be populated.
	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		informer.Run(ctx.Done())
	}()
	for !handler.HasSynced() {
		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return
		}
	}
	logger.Info("ResourceSlice informer has synced")

	for c.processNextWorkItem(ctx) {
	}
}

func (c *nodeResourcesController) processNextWorkItem(ctx context.Context) bool {
	key, shutdown := c.queue.Get()
	if shutdown {
		return false
	}
	defer c.queue.Done(key)

	driverName := key

	// Panics are caught and treated like errors.
	var err error
	func() {
		defer func() {
			if r := recover(); r != nil {
				err = fmt.Errorf("internal error: %v", r)
			}
		}()
		err = c.sync(ctx, driverName)
	}()

	if err != nil {
		// TODO (https://github.com/kubernetes/enhancements/issues/3077): contextual logging in utilruntime
		utilruntime.HandleError(fmt.Errorf("processing driver %v: %v", driverName, err))
		c.queue.AddRateLimited(key)

		// Return without removing the work item from the queue.
		// It will be retried.
		return true
	}

	c.queue.Forget(key)
	return true
}

func (c *nodeResourcesController) sync(ctx context.Context, driverName string) error {
	logger := klog.FromContext(ctx)

	// Gather information about the actual and desired state.
	slices := c.sliceStore.List()
	var driverResources []*resourceapi.ResourceModel
	c.mutex.RLock()
	if active, ok := c.activePlugins[driverName]; ok {
		// No need for a deep copy, the entire slice gets replaced on writes.
		driverResources = active.resources
	}
	c.mutex.RUnlock()

	// Resources that are not yet stored in any slice need to be published.
	// Here we track the indices of any resources that are already stored.
	storedResourceIndices := sets.New[int]()

	// Slices that don't match any driver resource can either be updated (if there
	// are new driver resources that need to be stored) or deleted.
	obsoleteSlices := make([]*resourceapi.ResourceSlice, 0, len(slices))

	// Match slices with resource information.
	for _, obj := range slices {
		slice := obj.(*resourceapi.ResourceSlice)
		if slice.DriverName != driverName {
			continue
		}

		index := indexOfModel(driverResources, &slice.ResourceModel)
		if index >= 0 {
			storedResourceIndices.Insert(index)
			continue
		}

		obsoleteSlices = append(obsoleteSlices, slice)
	}

	if loggerV := logger.V(6); loggerV.Enabled() {
		// Dump entire resource information.
		loggerV.Info("Syncing existing driver node resource slices with driver resources", "slices", klog.KObjSlice(slices), "resources", driverResources)
	} else {
		logger.V(5).Info("Syncing existing driver node resource slices with driver resources", "slices", klog.KObjSlice(slices), "numResources", len(driverResources))
	}

	// Update stale slices before removing what's left.
	//
	// We don't really know which of these slices might have
	// been used for "the" driver resource because they don't
	// have a unique ID. In practice, a driver is most likely
	// to just give us one ResourceModel, in which case
	// this isn't a problem at all. If we have more than one,
	// then at least conceptually it currently doesn't matter
	// where we publish it.
	//
	// The long-term goal is to move the handling of
	// ResourceSlice objects into the driver, with kubelet
	// just acting as a REST proxy. The advantage of that will
	// be that kubelet won't need to support the same
	// resource API version as the driver and the control plane.
	// With that approach, the driver will be able to match
	// up objects more intelligently.
	numObsoleteSlices := len(obsoleteSlices)
	for index, resource := range driverResources {
		if storedResourceIndices.Has(index) {
			// No need to do anything, it is already stored exactly
			// like this in an existing slice.
			continue
		}

		if numObsoleteSlices > 0 {
			// Update one existing slice.
			slice := obsoleteSlices[numObsoleteSlices-1]
			numObsoleteSlices--
			slice = slice.DeepCopy()
			slice.ResourceModel = *resource
			logger.V(5).Info("Reusing existing node resource slice", "slice", klog.KObj(slice))
			if _, err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Update(ctx, slice, metav1.UpdateOptions{}); err != nil {
				return fmt.Errorf("update node resource slice: %w", err)
			}
			continue
		}

		// Although the node name and UID are unlikely to change,
		// we fetch an updated node object just to be on the safe side.
		// It's a cheap operation as it gets the object from the node informer cache.
		node, err := c.getNode()
		if err != nil {
			return fmt.Errorf("retrieve node object: %w", err)
		}

		// Create a new slice.
		slice := &resourceapi.ResourceSlice{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: node.Name + "-" + driverName + "-",
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion: v1.SchemeGroupVersion.WithKind("Node").Version,
						Kind:       v1.SchemeGroupVersion.WithKind("Node").Kind,
						Name:       node.Name,
						UID:        node.UID,
						Controller: ptr.To(true),
					},
				},
			},
			NodeName:      node.Name,
			DriverName:    driverName,
			ResourceModel: *resource,
		}
		logger.V(5).Info("Creating new node resource slice", "slice", klog.KObj(slice))
		if _, err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Create(ctx, slice, metav1.CreateOptions{}); err != nil {
			return fmt.Errorf("create node resource slice: %w", err)
		}
	}

	// All remaining slices are truly orphaned.
	for i := 0; i < numObsoleteSlices; i++ {
		slice := obsoleteSlices[i]
		logger.V(5).Info("Deleting obsolete node resource slice", "slice", klog.KObj(slice))
		if err := c.kubeClient.ResourceV1alpha2().ResourceSlices().Delete(ctx, slice.Name, metav1.DeleteOptions{}); err != nil {
			return fmt.Errorf("delete node resource slice: %w", err)
		}
	}

	return nil
}

func indexOfModel(models []*resourceapi.ResourceModel, model *resourceapi.ResourceModel) int {
	for index, m := range models {
		if apiequality.Semantic.DeepEqual(m, model) {
			return index
		}
	}
	return -1
}
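What follows is a minimal usage sketch, not part of the original file: it shows how code in the same package might wire the controller into plugin registration, which is the role the doc comments on startNodeResourcesController, addPlugin, and removePlugin describe. The exampleRegistrationHandler type and function names below are invented for illustration; how a *plugin instance is obtained is assumed.

// Hypothetical wiring sketch (not part of the original file). It relies only
// on the unexported API defined above; exampleRegistrationHandler and its
// methods are assumptions for illustration.
type exampleRegistrationHandler struct {
	controller *nodeResourcesController
}

// newExampleRegistrationHandler starts the controller. With a nil kubeClient
// (stand-alone kubelet), startNodeResourcesController returns nil and every
// later call on the controller is a no-op thanks to the nil-receiver checks.
func newExampleRegistrationHandler(ctx context.Context, kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *exampleRegistrationHandler {
	return &exampleRegistrationHandler{
		controller: startNodeResourcesController(ctx, kubeClient, getNode),
	}
}

// registerPlugin mirrors what a registration callback might do once a DRA
// plugin instance is available: hand it to the controller so that the
// plugin's resources get published as ResourceSlices.
func (h *exampleRegistrationHandler) registerPlugin(driverName string, pluginInstance *plugin) {
	h.controller.addPlugin(driverName, pluginInstance)
}

// deregisterPlugin stops monitoring; the next sync for the driver then
// deletes its now-obsolete ResourceSlices.
func (h *exampleRegistrationHandler) deregisterPlugin(driverName string) {
	h.controller.removePlugin(driverName)
}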