k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/dra/test-driver/app/kubeletplugin.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package app
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"fmt"
    24  	"os"
    25  	"path/filepath"
    26  	"sync"
    27  
    28  	"github.com/google/go-cmp/cmp"
    29  	"google.golang.org/grpc"
    30  	"google.golang.org/grpc/codes"
    31  	"google.golang.org/grpc/status"
    32  
    33  	resourceapi "k8s.io/api/resource/v1alpha2"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	"k8s.io/apimachinery/pkg/util/sets"
    36  	"k8s.io/dynamic-resource-allocation/kubeletplugin"
    37  	"k8s.io/klog/v2"
    38  	drapbv1alpha3 "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
    39  )
    40  
// ExamplePlugin is the kubelet plugin of the DRA test driver. It serves the
// drapbv1alpha3.NodeServer gRPC API, writes one CDI file per prepared claim
// and records the gRPC calls that it receives so that tests can inspect them.
// Instances are created with StartPlugin.
type ExamplePlugin struct {
	// Set once in StartPlugin: stopCh is the Done channel of the context
	// passed to StartPlugin and d is the handle returned by
	// kubeletplugin.Start.
	stopCh  <-chan struct{}
	logger  klog.Logger
	d       kubeletplugin.DRAPlugin
	fileOps FileOperations

	// Static configuration. instances holds the names of the resource
	// instances that the plugin advertises ("instance-00", "instance-01", ...).
	cdiDir     string
	driverName string
	nodeName   string
	instances  sets.Set[string]

	// mutex must be held while accessing instancesInUse, prepared and
	// gRPCCalls. prepared maps each prepared claim to the resource handle
	// that it was prepared with.
	mutex          sync.Mutex
	instancesInUse sets.Set[string]
	prepared       map[ClaimID]any
	gRPCCalls      []GRPCCall

	// block is set by Block and makes Node[Un]PrepareResources wait until
	// the context of the call is done.
	block bool
}
    59  
// GRPCCall describes one gRPC call that was received by the plugin. Entries
// are recorded by the gRPC interceptors of the plugin and can be retrieved
// with GetGRPCCalls.
type GRPCCall struct {
	// FullMethod is the fully qualified method name, e.g. /package.service/method.
	FullMethod string

	// Request contains the parameters of the call. It is not recorded for streaming calls.
	Request interface{}

	// Response contains the reply of the plugin. It is nil for calls that are in progress.
	// It is not recorded for streaming calls.
	Response interface{}

	// Err contains the error return value of the plugin. It is nil for calls that are in progress or succeeded.
	Err error
}
    73  
// ClaimID contains both claim name and UID to simplify debugging. The
// namespace is not included because it is random in E2E tests and the UID is
// sufficient to make the ClaimID unique.
//
// It is used as the key of the map in which the plugin tracks prepared
// claims and is what GetPreparedResources returns.
type ClaimID struct {
	Name string
	UID  string
}
    81  
    82  var _ drapbv1alpha3.NodeServer = &ExamplePlugin{}
    83  
    84  // getJSONFilePath returns the absolute path where CDI file is/should be.
    85  func (ex *ExamplePlugin) getJSONFilePath(claimUID string) string {
    86  	return filepath.Join(ex.cdiDir, fmt.Sprintf("%s-%s.json", ex.driverName, claimUID))
    87  }
    88  
// FileOperations defines optional callbacks for handling CDI files
// and some other configuration. StartPlugin replaces callbacks that are
// nil with implementations that operate on the local file system.
type FileOperations struct {
	// Create must overwrite the file.
	Create func(name string, content []byte) error

	// Remove must remove the file. It must not return an error when the
	// file does not exist.
	Remove func(name string) error

	// NumResourceInstances determines whether the plugin reports resource
	// instances and how many. A negative value causes it to report "not implemented"
	// in the NodeListAndWatchResources gRPC call.
	NumResourceInstances int
}
   104  
   105  // StartPlugin sets up the servers that are necessary for a DRA kubelet plugin.
   106  func StartPlugin(ctx context.Context, cdiDir, driverName string, nodeName string, fileOps FileOperations, opts ...kubeletplugin.Option) (*ExamplePlugin, error) {
   107  	logger := klog.FromContext(ctx)
   108  	if fileOps.Create == nil {
   109  		fileOps.Create = func(name string, content []byte) error {
   110  			return os.WriteFile(name, content, os.FileMode(0644))
   111  		}
   112  	}
   113  	if fileOps.Remove == nil {
   114  		fileOps.Remove = func(name string) error {
   115  			if err := os.Remove(name); err != nil && !os.IsNotExist(err) {
   116  				return err
   117  			}
   118  			return nil
   119  		}
   120  	}
   121  	ex := &ExamplePlugin{
   122  		stopCh:         ctx.Done(),
   123  		logger:         logger,
   124  		fileOps:        fileOps,
   125  		cdiDir:         cdiDir,
   126  		driverName:     driverName,
   127  		nodeName:       nodeName,
   128  		instances:      sets.New[string](),
   129  		instancesInUse: sets.New[string](),
   130  		prepared:       make(map[ClaimID]any),
   131  	}
   132  
   133  	for i := 0; i < ex.fileOps.NumResourceInstances; i++ {
   134  		ex.instances.Insert(fmt.Sprintf("instance-%02d", i))
   135  	}
   136  
   137  	opts = append(opts,
   138  		kubeletplugin.Logger(logger),
   139  		kubeletplugin.DriverName(driverName),
   140  		kubeletplugin.GRPCInterceptor(ex.recordGRPCCall),
   141  		kubeletplugin.GRPCStreamInterceptor(ex.recordGRPCStream),
   142  	)
   143  	d, err := kubeletplugin.Start(ex, opts...)
   144  	if err != nil {
   145  		return nil, fmt.Errorf("start kubelet plugin: %w", err)
   146  	}
   147  	ex.d = d
   148  
   149  	return ex, nil
   150  }
   151  
// Stop ensures that all servers are stopped and resources freed. It
// delegates to the handle that was returned by kubeletplugin.Start.
func (ex *ExamplePlugin) Stop() {
	ex.d.Stop()
}
   156  
   157  func (ex *ExamplePlugin) IsRegistered() bool {
   158  	status := ex.d.RegistrationStatus()
   159  	if status == nil {
   160  		return false
   161  	}
   162  	return status.PluginRegistered
   163  }
   164  
// Block sets a flag to block Node[Un]PrepareResources
// to emulate time consuming or stuck calls. Once set, those calls
// wait until their context is done and then return the context's error.
// There is no way to clear the flag again.
//
// NOTE(review): the flag is written here and read by the prepare/unprepare
// code without holding ex.mutex, so Block should be called before gRPC calls
// can arrive - confirm that all callers do that.
func (ex *ExamplePlugin) Block() {
	ex.block = true
}
   170  
   171  // NodePrepareResource ensures that the CDI file for the claim exists. It uses
   172  // a deterministic name to simplify NodeUnprepareResource (no need to remember
   173  // or discover the name) and idempotency (when called again, the file simply
   174  // gets written again).
   175  func (ex *ExamplePlugin) nodePrepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) ([]string, error) {
   176  	logger := klog.FromContext(ctx)
   177  
   178  	// Block to emulate plugin stuckness or slowness.
   179  	// By default the call will not be blocked as ex.block = false.
   180  	if ex.block {
   181  		<-ctx.Done()
   182  		return nil, ctx.Err()
   183  	}
   184  
   185  	ex.mutex.Lock()
   186  	defer ex.mutex.Unlock()
   187  
   188  	deviceName := "claim-" + claimUID
   189  	vendor := ex.driverName
   190  	class := "test"
   191  	dev := vendor + "/" + class + "=" + deviceName
   192  	claimID := ClaimID{Name: claimName, UID: claimUID}
   193  	if _, ok := ex.prepared[claimID]; ok {
   194  		// Idempotent call, nothing to do.
   195  		return []string{dev}, nil
   196  	}
   197  
   198  	// Determine environment variables.
   199  	var p parameters
   200  	var actualResourceHandle any
   201  	var instanceNames []string
   202  	switch len(structuredResourceHandle) {
   203  	case 0:
   204  		// Control plane controller did the allocation.
   205  		if err := json.Unmarshal([]byte(resourceHandle), &p); err != nil {
   206  			return nil, fmt.Errorf("unmarshal resource handle: %w", err)
   207  		}
   208  		actualResourceHandle = resourceHandle
   209  	case 1:
   210  		// Scheduler did the allocation with structured parameters.
   211  		handle := structuredResourceHandle[0]
   212  		if handle == nil {
   213  			return nil, errors.New("unexpected nil StructuredResourceHandle")
   214  		}
   215  		p.NodeName = handle.NodeName
   216  		if err := extractParameters(handle.VendorClassParameters, &p.EnvVars, "admin"); err != nil {
   217  			return nil, err
   218  		}
   219  		if err := extractParameters(handle.VendorClaimParameters, &p.EnvVars, "user"); err != nil {
   220  			return nil, err
   221  		}
   222  		for _, result := range handle.Results {
   223  			if err := extractParameters(result.VendorRequestParameters, &p.EnvVars, "user"); err != nil {
   224  				return nil, err
   225  			}
   226  			namedResources := result.NamedResources
   227  			if namedResources == nil {
   228  				return nil, errors.New("missing named resources allocation result")
   229  			}
   230  			instanceName := namedResources.Name
   231  			if instanceName == "" {
   232  				return nil, errors.New("empty named resources instance name")
   233  			}
   234  			if !ex.instances.Has(instanceName) {
   235  				return nil, fmt.Errorf("unknown allocated instance %q", instanceName)
   236  			}
   237  			if ex.instancesInUse.Has(instanceName) {
   238  				return nil, fmt.Errorf("resource instance %q used more than once", instanceName)
   239  			}
   240  			instanceNames = append(instanceNames, instanceName)
   241  		}
   242  		actualResourceHandle = handle
   243  	default:
   244  		// Huh?
   245  		return nil, fmt.Errorf("invalid length of NodePrepareResourceRequest.StructuredResourceHandle: %d", len(structuredResourceHandle))
   246  	}
   247  
   248  	// Sanity check scheduling.
   249  	if p.NodeName != "" && ex.nodeName != "" && p.NodeName != ex.nodeName {
   250  		return nil, fmt.Errorf("claim was allocated for %q, cannot be prepared on %q", p.NodeName, ex.nodeName)
   251  	}
   252  
   253  	// CDI wants env variables as set of strings.
   254  	envs := []string{}
   255  	for key, val := range p.EnvVars {
   256  		envs = append(envs, key+"="+val)
   257  	}
   258  
   259  	spec := &spec{
   260  		Version: "0.3.0", // This has to be a version accepted by the runtimes.
   261  		Kind:    vendor + "/" + class,
   262  		// At least one device is required and its entry must have more
   263  		// than just the name.
   264  		Devices: []device{
   265  			{
   266  				Name: deviceName,
   267  				ContainerEdits: containerEdits{
   268  					Env: envs,
   269  				},
   270  			},
   271  		},
   272  	}
   273  	filePath := ex.getJSONFilePath(claimUID)
   274  	buffer, err := json.Marshal(spec)
   275  	if err != nil {
   276  		return nil, fmt.Errorf("marshal spec: %w", err)
   277  	}
   278  	if err := ex.fileOps.Create(filePath, buffer); err != nil {
   279  		return nil, fmt.Errorf("failed to write CDI file %v", err)
   280  	}
   281  
   282  	ex.prepared[claimID] = actualResourceHandle
   283  	for _, instanceName := range instanceNames {
   284  		ex.instancesInUse.Insert(instanceName)
   285  	}
   286  
   287  	logger.V(3).Info("CDI file created", "path", filePath, "device", dev)
   288  	return []string{dev}, nil
   289  }
   290  
   291  func extractParameters(parameters runtime.RawExtension, env *map[string]string, kind string) error {
   292  	if len(parameters.Raw) == 0 {
   293  		return nil
   294  	}
   295  	var data map[string]string
   296  	if err := json.Unmarshal(parameters.Raw, &data); err != nil {
   297  		return fmt.Errorf("decoding %s parameters: %v", kind, err)
   298  	}
   299  	if len(data) > 0 && *env == nil {
   300  		*env = make(map[string]string)
   301  	}
   302  	for key, value := range data {
   303  		(*env)[kind+"_"+key] = value
   304  	}
   305  	return nil
   306  }
   307  
   308  func (ex *ExamplePlugin) NodePrepareResources(ctx context.Context, req *drapbv1alpha3.NodePrepareResourcesRequest) (*drapbv1alpha3.NodePrepareResourcesResponse, error) {
   309  	resp := &drapbv1alpha3.NodePrepareResourcesResponse{
   310  		Claims: make(map[string]*drapbv1alpha3.NodePrepareResourceResponse),
   311  	}
   312  	for _, claimReq := range req.Claims {
   313  		cdiDevices, err := ex.nodePrepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle)
   314  		if err != nil {
   315  			resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodePrepareResourceResponse{
   316  				Error: err.Error(),
   317  			}
   318  		} else {
   319  			resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodePrepareResourceResponse{
   320  				CDIDevices: cdiDevices,
   321  			}
   322  		}
   323  	}
   324  	return resp, nil
   325  }
   326  
   327  // NodeUnprepareResource removes the CDI file created by
   328  // NodePrepareResource. It's idempotent, therefore it is not an error when that
   329  // file is already gone.
   330  func (ex *ExamplePlugin) nodeUnprepareResource(ctx context.Context, claimName string, claimUID string, resourceHandle string, structuredResourceHandle []*resourceapi.StructuredResourceHandle) error {
   331  	logger := klog.FromContext(ctx)
   332  
   333  	// Block to emulate plugin stuckness or slowness.
   334  	// By default the call will not be blocked as ex.block = false.
   335  	if ex.block {
   336  		<-ctx.Done()
   337  		return ctx.Err()
   338  	}
   339  
   340  	filePath := ex.getJSONFilePath(claimUID)
   341  	if err := ex.fileOps.Remove(filePath); err != nil {
   342  		return fmt.Errorf("error removing CDI file: %w", err)
   343  	}
   344  	logger.V(3).Info("CDI file removed", "path", filePath)
   345  
   346  	ex.mutex.Lock()
   347  	defer ex.mutex.Unlock()
   348  
   349  	claimID := ClaimID{Name: claimName, UID: claimUID}
   350  	expectedResourceHandle, ok := ex.prepared[claimID]
   351  	if !ok {
   352  		// Idempotent call, nothing to do.
   353  		return nil
   354  	}
   355  
   356  	var actualResourceHandle any = resourceHandle
   357  	if structuredResourceHandle != nil {
   358  		if len(structuredResourceHandle) != 1 {
   359  			return fmt.Errorf("unexpected number of entries in StructuredResourceHandle: %d", len(structuredResourceHandle))
   360  		}
   361  		actualResourceHandle = structuredResourceHandle[0]
   362  	}
   363  	if diff := cmp.Diff(expectedResourceHandle, actualResourceHandle); diff != "" {
   364  		return fmt.Errorf("difference between expected (-) and actual resource handle (+):\n%s", diff)
   365  	}
   366  	delete(ex.prepared, claimID)
   367  	if structuredResourceHandle := structuredResourceHandle; structuredResourceHandle != nil {
   368  		for _, handle := range structuredResourceHandle {
   369  			for _, result := range handle.Results {
   370  				instanceName := result.NamedResources.Name
   371  				ex.instancesInUse.Delete(instanceName)
   372  			}
   373  		}
   374  	}
   375  	delete(ex.prepared, ClaimID{Name: claimName, UID: claimUID})
   376  
   377  	return nil
   378  }
   379  
   380  func (ex *ExamplePlugin) NodeUnprepareResources(ctx context.Context, req *drapbv1alpha3.NodeUnprepareResourcesRequest) (*drapbv1alpha3.NodeUnprepareResourcesResponse, error) {
   381  	resp := &drapbv1alpha3.NodeUnprepareResourcesResponse{
   382  		Claims: make(map[string]*drapbv1alpha3.NodeUnprepareResourceResponse),
   383  	}
   384  	for _, claimReq := range req.Claims {
   385  		err := ex.nodeUnprepareResource(ctx, claimReq.Name, claimReq.Uid, claimReq.ResourceHandle, claimReq.StructuredResourceHandle)
   386  		if err != nil {
   387  			resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodeUnprepareResourceResponse{
   388  				Error: err.Error(),
   389  			}
   390  		} else {
   391  			resp.Claims[claimReq.Uid] = &drapbv1alpha3.NodeUnprepareResourceResponse{}
   392  		}
   393  	}
   394  	return resp, nil
   395  }
   396  
   397  func (ex *ExamplePlugin) NodeListAndWatchResources(req *drapbv1alpha3.NodeListAndWatchResourcesRequest, stream drapbv1alpha3.Node_NodeListAndWatchResourcesServer) error {
   398  	if ex.fileOps.NumResourceInstances < 0 {
   399  		ex.logger.Info("Sending no NodeResourcesResponse")
   400  		return status.New(codes.Unimplemented, "node resource support disabled").Err()
   401  	}
   402  
   403  	instances := make([]resourceapi.NamedResourcesInstance, len(ex.instances))
   404  	for i, name := range sets.List(ex.instances) {
   405  		instances[i].Name = name
   406  	}
   407  	resp := &drapbv1alpha3.NodeListAndWatchResourcesResponse{
   408  		Resources: []*resourceapi.ResourceModel{
   409  			{
   410  				NamedResources: &resourceapi.NamedResourcesResources{
   411  					Instances: instances,
   412  				},
   413  			},
   414  		},
   415  	}
   416  
   417  	ex.logger.Info("Sending NodeListAndWatchResourcesResponse", "response", resp)
   418  	if err := stream.Send(resp); err != nil {
   419  		return err
   420  	}
   421  
   422  	// Keep the stream open until the test is done.
   423  	// TODO: test sending more updates later
   424  	<-ex.stopCh
   425  	ex.logger.Info("Done sending NodeListAndWatchResourcesResponse, closing stream")
   426  
   427  	return nil
   428  }
   429  
   430  func (ex *ExamplePlugin) GetPreparedResources() []ClaimID {
   431  	ex.mutex.Lock()
   432  	defer ex.mutex.Unlock()
   433  	var prepared []ClaimID
   434  	for claimID := range ex.prepared {
   435  		prepared = append(prepared, claimID)
   436  	}
   437  	return prepared
   438  }
   439  
   440  func (ex *ExamplePlugin) recordGRPCCall(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
   441  	call := GRPCCall{
   442  		FullMethod: info.FullMethod,
   443  		Request:    req,
   444  	}
   445  	ex.mutex.Lock()
   446  	ex.gRPCCalls = append(ex.gRPCCalls, call)
   447  	index := len(ex.gRPCCalls) - 1
   448  	ex.mutex.Unlock()
   449  
   450  	// We don't hold the mutex here to allow concurrent calls.
   451  	call.Response, call.Err = handler(ctx, req)
   452  
   453  	ex.mutex.Lock()
   454  	ex.gRPCCalls[index] = call
   455  	ex.mutex.Unlock()
   456  
   457  	return call.Response, call.Err
   458  }
   459  
   460  func (ex *ExamplePlugin) recordGRPCStream(srv interface{}, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
   461  	call := GRPCCall{
   462  		FullMethod: info.FullMethod,
   463  	}
   464  	ex.mutex.Lock()
   465  	ex.gRPCCalls = append(ex.gRPCCalls, call)
   466  	index := len(ex.gRPCCalls) - 1
   467  	ex.mutex.Unlock()
   468  
   469  	// We don't hold the mutex here to allow concurrent calls.
   470  	call.Err = handler(srv, stream)
   471  
   472  	ex.mutex.Lock()
   473  	ex.gRPCCalls[index] = call
   474  	ex.mutex.Unlock()
   475  
   476  	return call.Err
   477  }
   478  
   479  func (ex *ExamplePlugin) GetGRPCCalls() []GRPCCall {
   480  	ex.mutex.Lock()
   481  	defer ex.mutex.Unlock()
   482  
   483  	// We must return a new slice, otherwise adding new calls would become
   484  	// visible to the caller. We also need to copy the entries because
   485  	// they get mutated by recordGRPCCall.
   486  	calls := make([]GRPCCall, 0, len(ex.gRPCCalls))
   487  	calls = append(calls, ex.gRPCCalls...)
   488  	return calls
   489  }