k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/dra/test-driver/app/kubeletplugin.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package app 18 19 import ( 20 "context" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "os" 25 "path/filepath" 26 "sync" 27 28 "github.com/google/go-cmp/cmp" 29 "google.golang.org/grpc" 30 "google.golang.org/grpc/codes" 31 "google.golang.org/grpc/status" 32 33 resourceapi "k8s.io/api/resource/v1alpha2" 34 "k8s.io/apimachinery/pkg/runtime" 35 "k8s.io/apimachinery/pkg/util/sets" 36 "k8s.io/dynamic-resource-allocation/kubeletplugin" 37 "k8s.io/klog/v2" 38 drapbv1alpha3 "k8s.io/kubelet/pkg/apis/dra/v1alpha3" 39 ) 40 41 type ExamplePlugin struct { 42 stopCh <-chan struct{} 43 logger klog.Logger 44 d kubeletplugin.DRAPlugin 45 fileOps FileOperations 46 47 cdiDir string 48 driverName string 49 nodeName string 50 instances sets.Set[string] 51 52 mutex sync.Mutex 53 instancesInUse sets.Set[string] 54 prepared map[ClaimID]any 55 gRPCCalls []GRPCCall 56 57 block bool 58 } 59 60 type GRPCCall struct { 61 // FullMethod is the fully qualified, e.g. /package.service/method. 62 FullMethod string 63 64 // Request contains the parameters of the call. 65 Request interface{} 66 67 // Response contains the reply of the plugin. It is nil for calls that are in progress. 68 Response interface{} 69 70 // Err contains the error return value of the plugin. It is nil for calls that are in progress or succeeded. 71 Err error 72 } 73 74 // ClaimID contains both claim name and UID to simplify debugging. The 75 // namespace is not included because it is random in E2E tests and the UID is 76 // sufficient to make the ClaimID unique. 77 type ClaimID struct { 78 Name string 79 UID string 80 } 81 82 var _ drapbv1alpha3.NodeServer = &ExamplePlugin{} 83 84 // getJSONFilePath returns the absolute path where CDI file is/should be. 85 func (ex *ExamplePlugin) getJSONFilePath(claimUID string) string { 86 return filepath.Join(ex.cdiDir, fmt.Sprintf("%s-%s.json", ex.driverName, claimUID)) 87 } 88 89 // FileOperations defines optional callbacks for handling CDI files 90 // and some other configuration. 91 type FileOperations struct { 92 // Create must overwrite the file. 93 Create func(name string, content []byte) error 94 95 // Remove must remove the file. It must not return an error when the 96 // file does not exist. 97 Remove func(name string) error 98 99 // NumResourceInstances determines whether the plugin reports resources 100 // instances and how many. A negative value causes it to report "not implemented" 101 // in the NodeListAndWatchResources gRPC call. 102 NumResourceInstances int 103 } 104 105 // StartPlugin sets up the servers that are necessary for a DRA kubelet plugin. 106 func StartPlugin(ctx context.Context, cdiDir, driverName string, nodeName string, fileOps FileOperations, opts ...kubeletplugin.Option) (*ExamplePlugin, error) { 107 logger := klog.FromContext(ctx) 108 if fileOps.Create == nil { 109 fileOps.Create = func(name string, content []byte) error { 110 return os.WriteFile(name, content, os.FileMode(0644)) 111 } 112 } 113 if fileOps.Remove == nil { 114 fileOps.Remove = func(name string) error { 115 if err := os.Remove(name); err != nil && !os.IsNotExist(err) { 116 return err 117 } 118 return nil 119 } 120 } 121 ex := &ExamplePlugin{ 122 stopCh: ctx.Done(), 123 logger: logger, 124 fileOps: fileOps, 125 cdiDir: cdiDir, 126 driverName: driverName, 127 nodeName: nodeName, 128 instances: sets.New[string](), 129 instancesInUse: sets.New[string](), 130 prepared: make(map[ClaimID]any), 131 } 132 133 for i := 0; i < ex.fileOps.NumResourceInstances; i++ { 134 ex.instances.Insert(fmt.Sprintf("instance-%02d", i)) 135 } 136 137 opts = append(opts, 138 kubeletplugin.Logger(logger), 139 kubeletplugin.DriverName(driverName), 140 kubeletplugin.GRPCInterceptor(ex.recordGRPCCall), 141 kubeletplugin.GRPCStreamInterceptor(ex.recordGRPCStream), 142 ) 143 d, err := kubeletplugin.Start(ex, opts...) 144 if err != nil { 145 return nil, fmt.Errorf("start kubelet plugin: %w", err) 146 } 147 ex.d = d 148 149 return ex, nil 150 } 151 152 // stop ensures that all servers are stopped and resources freed. 153 func (ex *ExamplePlugin) Stop() { 154 ex.d.Stop() 155 } 156 157 func (ex *ExamplePlugin) IsRegistered() bool { 158 status := ex.d.RegistrationStatus() 159 if status == nil { 160 return false 161 } 162 return status.PluginRegistered 163 } 164 165 // Block sets a flag to block Node[Un]PrepareResources 166 // to emulate time consuming or stuck calls 167 func (ex *ExamplePlugin) Block() { 168 ex.block = true 169 } 170 171 // NodePrepareResource ensures that the CDI file for the claim exists. It uses 172 // a deterministic name to simplify NodeUnprepareResource (no need to remember 173 // or discover the name) and idempotency (when called again, the file simply 174 // gets written again). 175 func (ex *ExamplePlugin) nodePrepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) ([]string, error) { 176 logger := klog.FromContext(ctx) 177 178 // Block to emulate plugin stuckness or slowness. 179 // By default the call will not be blocked as ex.block = false. 180 if ex.block { 181 <-ctx.Done() 182 return nil, ctx.Err() 183 } 184 185 ex.mutex.Lock() 186 defer ex.mutex.Unlock() 187 188 deviceName := "claim-" + claimUID 189 vendor := ex.driverName 190 class := "test" 191 dev := vendor + "/" + class + "=" + deviceName 192 claimID := ClaimID{Name: claimName, UID: claimUID} 193 if _, ok := ex.prepared[claimID]; ok { 194 // Idempotent call, nothing to do. 195 return []string{dev}, nil 196 } 197 198 // Determine environment variables. 199 var p parameters 200 var actualResourceHandle any 201 var instanceNames []string 202 switch len(structuredResourceHandle) { 203 case 0: 204 // Control plane controller did the allocation. 205 if err := json.Unmarshal([]byte(resourceHandle), &p); err != nil { 206 return nil, fmt.Errorf("unmarshal resource handle: %w", err) 207 } 208 actualResourceHandle = resourceHandle 209 case 1: 210 // Scheduler did the allocation with structured parameters. 211 handle := structuredResourceHandle[0] 212 if handle == nil { 213 return nil, errors.New("unexpected nil StructuredResourceHandle") 214 } 215 p.NodeName = handle.NodeName 216 if err := extractParameters(handle.VendorClassParameters, &p.EnvVars, "admin"); err != nil { 217 return nil, err 218 } 219 if err := extractParameters(handle.VendorClaimParameters, &p.EnvVars, "user"); err != nil { 220 return nil, err 221 } 222 for _, result := range handle.Results { 223 if err := extractParameters(result.VendorRequestParameters, &p.EnvVars, "user"); err != nil { 224 return nil, err 225 } 226 namedResources := result.NamedResources 227 if namedResources == nil { 228 return nil, errors.New("missing named resources allocation result") 229 } 230 instanceName := namedResources.Name 231 if instanceName == "" { 232 return nil, errors.New("empty named resources instance name") 233 } 234 if !ex.instances.Has(instanceName) { 235 return nil, fmt.Errorf("unknown allocated instance %q", instanceName) 236 } 237 if ex.instancesInUse.Has(instanceName) { 238 return nil, fmt.Errorf("resource instance %q used more than once", instanceName) 239 } 240 instanceNames = append(instanceNames, instanceName) 241 } 242 actualResourceHandle = handle 243 default: 244 // Huh? 245 return nil, fmt.Errorf("invalid length of NodePrepareResourceRequest.StructuredResourceHandle: %d", len(structuredResourceHandle)) 246 } 247 248 // Sanity check scheduling. 249 if p.NodeName != "" && ex.nodeName != "" && p.NodeName != ex.nodeName { 250 return nil, fmt.Errorf("claim was allocated for %q, cannot be prepared on %q", p.NodeName, ex.nodeName) 251 } 252 253 // CDI wants env variables as set of strings. 254 envs := []string{} 255 for key, val := range p.EnvVars { 256 envs = append(envs, key+"="+val) 257 } 258 259 spec := &spec{ 260 Version: "0.3.0", // This has to be a version accepted by the runtimes. 261 Kind: vendor + "/" + class, 262 // At least one device is required and its entry must have more 263 // than just the name. 264 Devices: []device{ 265 { 266 Name: deviceName, 267 ContainerEdits: containerEdits{ 268 Env: envs, 269 }, 270 }, 271 }, 272 } 273 filePath := ex.getJSONFilePath(claimUID) 274 buffer, err := json.Marshal(spec) 275 if err != nil { 276 return nil, fmt.Errorf("marshal spec: %w", err) 277 } 278 if err := ex.fileOps.Create(filePath, buffer); err != nil { 279 return nil, fmt.Errorf("failed to write CDI file %v", err) 280 } 281 282 ex.prepared[claimID] = actualResourceHandle 283 for _, instanceName := range instanceNames { 284 ex.instancesInUse.Insert(instanceName) 285 } 286 287 logger.V(3).Info("CDI file created", "path", filePath, "device", dev) 288 return []string{dev}, nil 289 } 290 291 func extractParameters(parameters runtime.RawExtension, env *map[string]string, kind string) error { 292 if len(parameters.Raw) == 0 { 293 return nil 294 } 295 var data map[string]string 296 if err := json.Unmarshal(parameters.Raw, &data); err != nil { 297 return fmt.Errorf("decoding %s parameters: %v", kind, err) 298 } 299 if len(data) > 0 && *env == nil { 300 *env = make(map[string]string) 301 } 302 for key, value := range data { 303 (*env)[kind+"_"+key] = value 304 } 305 return nil 306 } 307 308 func (ex *ExamplePlugin) NodePrepareResources(ctx context.Context, req *drapbv1alpha3.NodePrepareResourcesRequest) (*drapbv1alpha3.NodePrepareResourcesResponse, error) { 309 resp := &drapbv1alpha3.NodePrepareResourcesResponse{ 310 Claims: make(map[string]*drapbv1alpha3.NodePrepareResourceResponse), 311 } 312 for _, claimReq := range req.Claims { 313 cdiDevices, err := ex.nodePrepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle) 314 if err != nil { 315 resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodePrepareResourceResponse{ 316 Error: err.Error(), 317 } 318 } else { 319 resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodePrepareResourceResponse{ 320 CDIDevices: cdiDevices, 321 } 322 } 323 } 324 return resp, nil 325 } 326 327 // NodeUnprepareResource removes the CDI file created by 328 // NodePrepareResource. It's idempotent, therefore it is not an error when that 329 // file is already gone. 330 func (ex *ExamplePlugin) nodeUnprepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) error { 331 logger := klog.FromContext(ctx) 332 333 // Block to emulate plugin stuckness or slowness. 334 // By default the call will not be blocked as ex.block = false. 335 if ex.block { 336 <-ctx.Done() 337 return ctx.Err() 338 } 339 340 filePath := ex.getJSONFilePath(claimUID) 341 if err := ex.fileOps.Remove(filePath); err != nil { 342 return fmt.Errorf("error removing CDI file: %w", err) 343 } 344 logger.V(3).Info("CDI file removed", "path", filePath) 345 346 ex.mutex.Lock() 347 defer ex.mutex.Unlock() 348 349 claimID := ClaimID{Name: claimName, UID: claimUID} 350 expectedResourceHandle, ok := ex.prepared[claimID] 351 if !ok { 352 // Idempotent call, nothing to do. 353 return nil 354 } 355 356 var actualResourceHandle any = resourceHandle 357 if structuredResourceHandle != nil { 358 if len(structuredResourceHandle) != 1 { 359 return fmt.Errorf("unexpected number of entries in StructuredResourceHandle: %d", len(structuredResourceHandle)) 360 } 361 actualResourceHandle = structuredResourceHandle[0] 362 } 363 if diff := cmp.Diff(expectedResourceHandle, actualResourceHandle); diff != "" { 364 return fmt.Errorf("difference between expected (-) and actual resource handle (+):\n%s", diff) 365 } 366 delete(ex.prepared, claimID) 367 if structuredResourceHandle := structuredResourceHandle; structuredResourceHandle != nil { 368 for _, handle := range structuredResourceHandle { 369 for _, result := range handle.Results { 370 instanceName := result.NamedResources.Name 371 ex.instancesInUse.Delete(instanceName) 372 } 373 } 374 } 375 delete(ex.prepared, ClaimID{Name: claimName, UID: claimUID}) 376 377 return nil 378 } 379 380 func (ex *ExamplePlugin) NodeUnprepareResources(ctx context.Context, req *drapbv1alpha3.NodeUnprepareResourcesRequest) (*drapbv1alpha3.NodeUnprepareResourcesResponse, error) { 381 resp := &drapbv1alpha3.NodeUnprepareResourcesResponse{ 382 Claims: make(map[string]*drapbv1alpha3.NodeUnprepareResourceResponse), 383 } 384 for _, claimReq := range req.Claims { 385 err := ex.nodeUnprepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle) 386 if err != nil { 387 resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodeUnprepareResourceResponse{ 388 Error: err.Error(), 389 } 390 } else { 391 resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodeUnprepareResourceResponse{} 392 } 393 } 394 return resp, nil 395 } 396 397 func (ex *ExamplePlugin) NodeListAndWatchResources(req *drapbv1alpha3.NodeListAndWatchResourcesRequest, stream drapbv1alpha3.Node_NodeListAndWatchResourcesServer) error { 398 if ex.fileOps.NumResourceInstances < 0 { 399 ex.logger.Info("Sending no NodeResourcesResponse") 400 return status.New(codes.Unimplemented, "node resource support disabled").Err() 401 } 402 403 instances := make([]resourceapi.NamedResourcesInstance, len(ex.instances)) 404 for i, name := range sets.List(ex.instances) { 405 instances[i].Name = name 406 } 407 resp := &drapbv1alpha3.NodeListAndWatchResourcesResponse{ 408 Resources: []*resourceapi.ResourceModel{ 409 { 410 NamedResources: &resourceapi.NamedResourcesResources{ 411 Instances: instances, 412 }, 413 }, 414 }, 415 } 416 417 ex.logger.Info("Sending NodeListAndWatchResourcesResponse", "response", resp) 418 if err := stream.Send(resp); err != nil { 419 return err 420 } 421 422 // Keep the stream open until the test is done. 423 // TODO: test sending more updates later 424 <-ex.stopCh 425 ex.logger.Info("Done sending NodeListAndWatchResourcesResponse, closing stream") 426 427 return nil 428 } 429 430 func (ex *ExamplePlugin) GetPreparedResources() []ClaimID { 431 ex.mutex.Lock() 432 defer ex.mutex.Unlock() 433 var prepared []ClaimID 434 for claimID := range ex.prepared { 435 prepared = append(prepared, claimID) 436 } 437 return prepared 438 } 439 440 func (ex *ExamplePlugin) recordGRPCCall(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) { 441 call := GRPCCall{ 442 FullMethod: info.FullMethod, 443 Request: req, 444 } 445 ex.mutex.Lock() 446 ex.gRPCCalls = append(ex.gRPCCalls, call) 447 index := len(ex.gRPCCalls) - 1 448 ex.mutex.Unlock() 449 450 // We don't hold the mutex here to allow concurrent calls. 451 call.Response, call.Err = handler(ctx, req) 452 453 ex.mutex.Lock() 454 ex.gRPCCalls[index] = call 455 ex.mutex.Unlock() 456 457 return call.Response, call.Err 458 } 459 460 func (ex *ExamplePlugin) recordGRPCStream(srv interface{}, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 461 call := GRPCCall{ 462 FullMethod: info.FullMethod, 463 } 464 ex.mutex.Lock() 465 ex.gRPCCalls = append(ex.gRPCCalls, call) 466 index := len(ex.gRPCCalls) - 1 467 ex.mutex.Unlock() 468 469 // We don't hold the mutex here to allow concurrent calls. 470 call.Err = handler(srv, stream) 471 472 ex.mutex.Lock() 473 ex.gRPCCalls[index] = call 474 ex.mutex.Unlock() 475 476 return call.Err 477 } 478 479 func (ex *ExamplePlugin) GetGRPCCalls() []GRPCCall { 480 ex.mutex.Lock() 481 defer ex.mutex.Unlock() 482 483 // We must return a new slice, otherwise adding new calls would become 484 // visible to the caller. We also need to copy the entries because 485 // they get mutated by recordGRPCCall. 486 calls := make([]GRPCCall, 0, len(ex.gRPCCalls)) 487 calls = append(calls, ex.gRPCCalls...) 488 return calls 489 }