github.com/openshift/dpu-operator@v0.0.0-20240502153209-3af840d137c2/daemon/device-plugin/deviceplugin.go (about) 1 package nfdeviceplugin 2 3 import ( 4 "context" 5 "fmt" 6 "net" 7 "os" 8 "path/filepath" 9 "time" 10 11 "github.com/go-logr/logr" 12 pb "github.com/openshift/dpu-operator/dpu-api/gen" 13 "github.com/openshift/dpu-operator/dpu-cni/pkgs/cnitypes" 14 "google.golang.org/grpc" 15 "google.golang.org/grpc/credentials/insecure" 16 pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 17 ctrl "sigs.k8s.io/controller-runtime" 18 ) 19 20 const ( 21 VendorPluginSocketPath string = cnitypes.DaemonBaseDir + "vendor-plugin/vendor-plugin.sock" 22 23 // Device plugin settings. 24 pluginMountPath = "/var/lib/kubelet/device-plugins" 25 kubeletEndpoint = "kubelet.sock" 26 pluginEndpoint = "sriovNet.sock" 27 resourceName = "openshift.io/dpu" 28 ) 29 30 // sriovManager manages sriov networking devices 31 type nfResources struct { 32 socketFile string 33 devices map[string]pluginapi.Device // for Kubelet DP API 34 grpcServer *grpc.Server 35 pluginapi.DevicePluginServer 36 log logr.Logger 37 client pb.DeviceServiceClient 38 conn *grpc.ClientConn 39 } 40 41 type DevicePlugin interface { 42 Start() error 43 } 44 45 func (nf *nfResources) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error { 46 changed := true 47 for { 48 if changed { 49 resp := new(pluginapi.ListAndWatchResponse) 50 for _, dev := range nf.devices { 51 resp.Devices = append(resp.Devices, &pluginapi.Device{ID: dev.ID, Health: dev.Health}) 52 } 53 fmt.Printf("ListAndWatch: send devices %v\n", resp) 54 if err := stream.Send(resp); err != nil { 55 fmt.Printf("Error. Cannot update device states: %v\n", err) 56 nf.grpcServer.Stop() 57 return err 58 } 59 } 60 time.Sleep(5 * time.Second) 61 changed = nf.Changed() 62 } 63 } 64 65 func (nf *nfResources) Changed() bool { 66 changed := false 67 for id, dev := range nf.devices { 68 state := nf.GetDeviceState(id) 69 if dev.Health != state { 70 changed = true 71 dev.Health = state 72 nf.devices[id] = dev 73 } 74 } 75 return changed 76 } 77 78 // Allocate passes the dev name as an env variable to the requesting container 79 func (nf *nfResources) Allocate(ctx context.Context, rqt *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 80 resp := new(pluginapi.AllocateResponse) 81 devName := "" 82 for _, container := range rqt.ContainerRequests { 83 containerResp := new(pluginapi.ContainerAllocateResponse) 84 for _, id := range container.DevicesIDs { 85 fmt.Printf("DeviceID in Allocate: %v \n", id) 86 dev, ok := nf.devices[id] 87 if !ok { 88 fmt.Printf("Error. Invalid allocation request with non-existing device %s", id) 89 } 90 if dev.Health != pluginapi.Healthy { 91 fmt.Printf("Error. Invalid allocation request with unhealthy device %s", id) 92 } 93 94 devName = devName + id + "," 95 } 96 97 fmt.Printf("device(s) allocated: %s\n", devName) 98 envmap := make(map[string]string) 99 envmap["NF-DEV"] = devName 100 101 containerResp.Envs = envmap 102 resp.ContainerResponses = append(resp.ContainerResponses, containerResp) 103 } 104 return resp, nil 105 } 106 107 func (nf *nfResources) GetDeviceState(DeviceName string) string { 108 // TODO: Discover device health 109 return pluginapi.Healthy 110 } 111 112 func (nf *nfResources) Start() error { 113 nf.cleanup() 114 nf.ensureConnected() 115 116 ctx := context.Background() 117 118 Devices, err := nf.client.GetDevices(ctx, &pb.Empty{}) 119 if err != nil { 120 nf.log.Error(err, "Failed to handle GetDevices Request") 121 return err 122 } 123 124 for _, device := range Devices.Devices { 125 nf.devices[device.ID] = pluginapi.Device{ID: device.ID, Health: pluginapi.Healthy} 126 } 127 128 for dev := range nf.devices { 129 nf.log.Info(dev) 130 } 131 132 pluginEndpoint := filepath.Join(pluginapi.DevicePluginPath, nf.socketFile) 133 fmt.Printf("Starting NF Device Plugin server at: %s\n", pluginEndpoint) 134 lis, err := net.Listen("unix", pluginEndpoint) 135 if err != nil { 136 fmt.Printf("Error: Starting NF Device Plugin server failed: %v", err) 137 } 138 nf.grpcServer = grpc.NewServer() 139 140 kubeletEndpoint := filepath.Join("unix:", DeprecatedSockDir, KubeEndPoint) 141 142 conn, err := grpc.Dial(kubeletEndpoint, grpc.WithTransportCredentials(insecure.NewCredentials())) 143 144 if err != nil { 145 fmt.Printf("%s device plugin unable connect to Kubelet : %v", resourceName, err) 146 return err 147 } 148 defer conn.Close() 149 150 pluginapi.RegisterDevicePluginServer(nf.grpcServer, nf) 151 152 client := pluginapi.NewRegistrationClient(conn) 153 154 go nf.grpcServer.Serve(lis) 155 156 // Use connectWithRetry for the pluginEndpoint call 157 conn, err = nf.connectWithRetry("unix:" + pluginEndpoint) 158 if err != nil { 159 fmt.Printf("error. unable to establish test connection with %s gRPC server: %v", resourceName, err) 160 return err 161 } 162 fmt.Printf("%s device plugin endpoint started serving \n", resourceName) 163 conn.Close() 164 165 ctx = context.Background() 166 167 request := &pluginapi.RegisterRequest{ 168 Version: pluginapi.Version, 169 Endpoint: nf.socketFile, 170 ResourceName: resourceName, 171 } 172 173 if _, err = client.Register(ctx, request); err != nil { 174 fmt.Printf("%s device plugin unable to register with Kubelet : %v \n", resourceName, err) 175 return err 176 } 177 fmt.Printf("%s device plugin registered with Kubelet\n", resourceName) 178 179 return nil 180 } 181 182 // connectWithRetry tries to establish a connection with the given endpoint, with retries. 183 func (nf *nfResources) connectWithRetry(endpoint string) (*grpc.ClientConn, error) { 184 var conn *grpc.ClientConn 185 var err error 186 187 retryPolicy := `{ 188 "methodConfig": [{ 189 "waitForReady": true, 190 "retryPolicy": { 191 "MaxAttempts": 40, 192 "InitialBackoff": "1s", 193 "MaxBackoff": "16s", 194 "BackoffMultiplier": 2.0, 195 "RetryableStatusCodes": [ "UNAVAILABLE" ] 196 } 197 }]}` 198 199 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 200 defer cancel() 201 202 conn, err = grpc.DialContext( 203 ctx, 204 endpoint, 205 grpc.WithTransportCredentials(insecure.NewCredentials()), 206 grpc.WithBlock(), 207 grpc.WithDefaultServiceConfig(retryPolicy), 208 ) 209 if err != nil { 210 nf.log.Error(err, "Failed to establish connection with retry", "endpoint", endpoint) 211 return nil, err 212 } 213 214 return conn, nil 215 } 216 217 func (g *nfResources) ensureConnected() error { 218 if g.client != nil { 219 return nil 220 } 221 dialOptions := []grpc.DialOption{ 222 grpc.WithTransportCredentials(insecure.NewCredentials()), 223 grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { 224 return net.Dial("unix", addr) 225 }), 226 } 227 228 conn, err := grpc.DialContext(context.Background(), VendorPluginSocketPath, dialOptions...) 229 230 if err != nil { 231 g.log.Error(err, "Failed to connect to vendor plugin") 232 return err 233 } 234 g.conn = conn 235 236 g.client = pb.NewDeviceServiceClient(conn) 237 return nil 238 } 239 240 // func (nf *nfResources) Stop() error { 241 // fmt.Printf("Stopping Device Plugin gRPC server..") 242 // if nf.grpcServer == nil { 243 // return nil 244 // } 245 246 // nf.grpcServer.Stop() 247 // nf.grpcServer = nil 248 249 // return nf.cleanup() 250 // } 251 252 func (nf *nfResources) cleanup() error { 253 pluginEndpoint := filepath.Join(pluginapi.DevicePluginPath, nf.socketFile) 254 if err := os.Remove(pluginEndpoint); err != nil && !os.IsNotExist(err) { 255 return err 256 } 257 258 return nil 259 } 260 261 func (nf *nfResources) PreStartContainer(ctx context.Context, psRqt *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 262 return &pluginapi.PreStartContainerResponse{}, nil 263 } 264 265 func (nf *nfResources) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 266 return &pluginapi.DevicePluginOptions{ 267 PreStartRequired: false, 268 }, nil 269 } 270 271 func NewGrpcPlugin() *nfResources { 272 return &nfResources{ 273 log: ctrl.Log.WithName("GrpcPlugin"), 274 devices: make(map[string]pluginapi.Device), 275 socketFile: pluginEndpoint, 276 } 277 }