github.com/openshift/dpu-operator@v0.0.0-20240502153209-3af840d137c2/daemon/device-plugin/deviceplugin.go (about)

     1  package nfdeviceplugin
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"os"
     8  	"path/filepath"
     9  	"time"
    10  
    11  	"github.com/go-logr/logr"
    12  	pb "github.com/openshift/dpu-operator/dpu-api/gen"
    13  	"github.com/openshift/dpu-operator/dpu-cni/pkgs/cnitypes"
    14  	"google.golang.org/grpc"
    15  	"google.golang.org/grpc/credentials/insecure"
    16  	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
    17  	ctrl "sigs.k8s.io/controller-runtime"
    18  )
    19  
    20  const (
    21  	VendorPluginSocketPath string = cnitypes.DaemonBaseDir + "vendor-plugin/vendor-plugin.sock"
    22  
    23  	// Device plugin settings.
    24  	pluginMountPath = "/var/lib/kubelet/device-plugins"
    25  	kubeletEndpoint = "kubelet.sock"
    26  	pluginEndpoint  = "sriovNet.sock"
    27  	resourceName    = "openshift.io/dpu"
    28  )
    29  
    30  // sriovManager manages sriov networking devices
    31  type nfResources struct {
    32  	socketFile string
    33  	devices    map[string]pluginapi.Device // for Kubelet DP API
    34  	grpcServer *grpc.Server
    35  	pluginapi.DevicePluginServer
    36  	log    logr.Logger
    37  	client pb.DeviceServiceClient
    38  	conn   *grpc.ClientConn
    39  }
    40  
    41  type DevicePlugin interface {
    42  	Start() error
    43  }
    44  
    45  func (nf *nfResources) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
    46  	changed := true
    47  	for {
    48  		if changed {
    49  			resp := new(pluginapi.ListAndWatchResponse)
    50  			for _, dev := range nf.devices {
    51  				resp.Devices = append(resp.Devices, &pluginapi.Device{ID: dev.ID, Health: dev.Health})
    52  			}
    53  			fmt.Printf("ListAndWatch: send devices %v\n", resp)
    54  			if err := stream.Send(resp); err != nil {
    55  				fmt.Printf("Error. Cannot update device states: %v\n", err)
    56  				nf.grpcServer.Stop()
    57  				return err
    58  			}
    59  		}
    60  		time.Sleep(5 * time.Second)
    61  		changed = nf.Changed()
    62  	}
    63  }
    64  
    65  func (nf *nfResources) Changed() bool {
    66  	changed := false
    67  	for id, dev := range nf.devices {
    68  		state := nf.GetDeviceState(id)
    69  		if dev.Health != state {
    70  			changed = true
    71  			dev.Health = state
    72  			nf.devices[id] = dev
    73  		}
    74  	}
    75  	return changed
    76  }
    77  
    78  // Allocate passes the dev name as an env variable to the requesting container
    79  func (nf *nfResources) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
    80  	resp := new(pluginapi.AllocateResponse)
    81  	devName := ""
    82  	for _, container := range rqt.ContainerRequests {
    83  		containerResp := new(pluginapi.ContainerAllocateResponse)
    84  		for _, id := range container.DevicesIDs {
    85  			fmt.Printf("DeviceID in Allocate: %v \n", id)
    86  			dev, ok := nf.devices[id]
    87  			if !ok {
    88  				fmt.Printf("Error. Invalid allocation request with non-existing device %s", id)
    89  			}
    90  			if dev.Health != pluginapi.Healthy {
    91  				fmt.Printf("Error. Invalid allocation request with unhealthy device %s", id)
    92  			}
    93  
    94  			devName = devName + id + ","
    95  		}
    96  
    97  		fmt.Printf("device(s) allocated: %s\n", devName)
    98  		envmap := make(map[string]string)
    99  		envmap["NF-DEV"] = devName
   100  
   101  		containerResp.Envs = envmap
   102  		resp.ContainerResponses = append(resp.ContainerResponses, containerResp)
   103  	}
   104  	return resp, nil
   105  }
   106  
   107  func (nf *nfResources) GetDeviceState(DeviceName string) string {
   108  	// TODO: Discover device health
   109  	return pluginapi.Healthy
   110  }
   111  
   112  func (nf *nfResources) Start() error {
   113  	nf.cleanup()
   114  	nf.ensureConnected()
   115  
   116  	ctx := context.Background()
   117  
   118  	Devices, err := nf.client.GetDevices(ctx, &pb.Empty{})
   119  	if err != nil {
   120  		nf.log.Error(err, "Failed to handle GetDevices Request")
   121  		return err
   122  	}
   123  
   124  	for _, device := range Devices.Devices {
   125  		nf.devices[device.ID] = pluginapi.Device{ID: device.ID, Health: pluginapi.Healthy}
   126  	}
   127  
   128  	for dev := range nf.devices {
   129  		nf.log.Info(dev)
   130  	}
   131  
   132  	pluginEndpoint := filepath.Join(pluginapi.DevicePluginPath, nf.socketFile)
   133  	fmt.Printf("Starting NF Device Plugin server at: %s\n", pluginEndpoint)
   134  	lis, err := net.Listen("unix", pluginEndpoint)
   135  	if err != nil {
   136  		fmt.Printf("Error: Starting NF Device Plugin server failed: %v", err)
   137  	}
   138  	nf.grpcServer = grpc.NewServer()
   139  
   140  	kubeletEndpoint := filepath.Join("unix:", DeprecatedSockDir, KubeEndPoint)
   141  
   142  	conn, err := grpc.Dial(kubeletEndpoint, grpc.WithTransportCredentials(insecure.NewCredentials()))
   143  
   144  	if err != nil {
   145  		fmt.Printf("%s device plugin unable connect to Kubelet : %v", resourceName, err)
   146  		return err
   147  	}
   148  	defer conn.Close()
   149  
   150  	pluginapi.RegisterDevicePluginServer(nf.grpcServer, nf)
   151  
   152  	client := pluginapi.NewRegistrationClient(conn)
   153  
   154  	go nf.grpcServer.Serve(lis)
   155  
   156  	// Use connectWithRetry for the pluginEndpoint call
   157  	conn, err = nf.connectWithRetry("unix:" + pluginEndpoint)
   158  	if err != nil {
   159  		fmt.Printf("error. unable to establish test connection with %s gRPC server: %v", resourceName, err)
   160  		return err
   161  	}
   162  	fmt.Printf("%s device plugin endpoint started serving \n", resourceName)
   163  	conn.Close()
   164  
   165  	ctx = context.Background()
   166  
   167  	request := &pluginapi.RegisterRequest{
   168  		Version:      pluginapi.Version,
   169  		Endpoint:     nf.socketFile,
   170  		ResourceName: resourceName,
   171  	}
   172  
   173  	if _, err = client.Register(ctx, request); err != nil {
   174  		fmt.Printf("%s device plugin unable to register with Kubelet : %v \n", resourceName, err)
   175  		return err
   176  	}
   177  	fmt.Printf("%s device plugin registered with Kubelet\n", resourceName)
   178  
   179  	return nil
   180  }
   181  
   182  // connectWithRetry tries to establish a connection with the given endpoint, with retries.
   183  func (nf *nfResources) connectWithRetry(endpoint string) (*grpc.ClientConn, error) {
   184  	var conn *grpc.ClientConn
   185  	var err error
   186  
   187  	retryPolicy := `{
   188  		"methodConfig": [{
   189  		  "waitForReady": true,
   190  		  "retryPolicy": {
   191  			  "MaxAttempts": 40,
   192  			  "InitialBackoff": "1s",
   193  			  "MaxBackoff": "16s",
   194  			  "BackoffMultiplier": 2.0,
   195  			  "RetryableStatusCodes": [ "UNAVAILABLE" ]
   196  		  }
   197  		}]}`
   198  
   199  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   200  	defer cancel()
   201  
   202  	conn, err = grpc.DialContext(
   203  		ctx,
   204  		endpoint,
   205  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   206  		grpc.WithBlock(),
   207  		grpc.WithDefaultServiceConfig(retryPolicy),
   208  	)
   209  	if err != nil {
   210  		nf.log.Error(err, "Failed to establish connection with retry", "endpoint", endpoint)
   211  		return nil, err
   212  	}
   213  
   214  	return conn, nil
   215  }
   216  
   217  func (g *nfResources) ensureConnected() error {
   218  	if g.client != nil {
   219  		return nil
   220  	}
   221  	dialOptions := []grpc.DialOption{
   222  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   223  		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
   224  			return net.Dial("unix", addr)
   225  		}),
   226  	}
   227  
   228  	conn, err := grpc.DialContext(context.Background(), VendorPluginSocketPath, dialOptions...)
   229  
   230  	if err != nil {
   231  		g.log.Error(err, "Failed to connect to vendor plugin")
   232  		return err
   233  	}
   234  	g.conn = conn
   235  
   236  	g.client = pb.NewDeviceServiceClient(conn)
   237  	return nil
   238  }
   239  
   240  // func (nf *nfResources) Stop() error {
   241  // 	fmt.Printf("Stopping Device Plugin gRPC server..")
   242  // 	if nf.grpcServer == nil {
   243  // 		return nil
   244  // 	}
   245  
   246  // 	nf.grpcServer.Stop()
   247  // 	nf.grpcServer = nil
   248  
   249  // 	return nf.cleanup()
   250  // }
   251  
   252  func (nf *nfResources) cleanup() error {
   253  	pluginEndpoint := filepath.Join(pluginapi.DevicePluginPath, nf.socketFile)
   254  	if err := os.Remove(pluginEndpoint); err != nil && !os.IsNotExist(err) {
   255  		return err
   256  	}
   257  
   258  	return nil
   259  }
   260  
   261  func (nf *nfResources) PreStartContainer(ctx context.Context, psRqt *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
   262  	return &pluginapi.PreStartContainerResponse{}, nil
   263  }
   264  
   265  func (nf *nfResources) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
   266  	return &pluginapi.DevicePluginOptions{
   267  		PreStartRequired: false,
   268  	}, nil
   269  }
   270  
   271  func NewGrpcPlugin() *nfResources {
   272  	return &nfResources{
   273  		log:        ctrl.Log.WithName("GrpcPlugin"),
   274  		devices:    make(map[string]pluginapi.Device),
   275  		socketFile: pluginEndpoint,
   276  	}
   277  }