k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package v1beta1 18 19 import ( 20 "context" 21 "net" 22 "os" 23 "path/filepath" 24 "sync" 25 "time" 26 27 "github.com/fsnotify/fsnotify" 28 "google.golang.org/grpc" 29 "google.golang.org/grpc/credentials/insecure" 30 31 "k8s.io/apimachinery/pkg/util/wait" 32 "k8s.io/klog/v2" 33 pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 34 watcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1" 35 ) 36 37 // Stub implementation for DevicePlugin. 38 type Stub struct { 39 devs []*pluginapi.Device 40 socket string 41 resourceName string 42 preStartContainerFlag bool 43 getPreferredAllocationFlag bool 44 45 stop chan interface{} 46 wg sync.WaitGroup 47 update chan []*pluginapi.Device 48 49 server *grpc.Server 50 51 // allocFunc is used for handling allocation request 52 allocFunc stubAllocFunc 53 54 // getPreferredAllocFunc is used for handling getPreferredAllocation request 55 getPreferredAllocFunc stubGetPreferredAllocFunc 56 57 // registerControlFunc is used for controlling auto-registration of requests 58 registerControlFunc stubRegisterControlFunc 59 60 registrationStatus chan watcherapi.RegistrationStatus // for testing 61 endpoint string // for testing 62 63 kubeletRestartWatcher *fsnotify.Watcher 64 } 65 66 // stubGetPreferredAllocFunc is the function called when a getPreferredAllocation request is received from Kubelet 67 type stubGetPreferredAllocFunc func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) 68 69 func defaultGetPreferredAllocFunc(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) { 70 var response pluginapi.PreferredAllocationResponse 71 72 return &response, nil 73 } 74 75 // stubAllocFunc is the function called when an allocation request is received from Kubelet 76 type stubAllocFunc func(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) 77 78 func defaultAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) { 79 var response pluginapi.AllocateResponse 80 81 return &response, nil 82 } 83 84 // stubRegisterControlFunc is the function called when a registration request is received from Kubelet 85 type stubRegisterControlFunc func() bool 86 87 func defaultRegisterControlFunc() bool { 88 return true 89 } 90 91 // NewDevicePluginStub returns an initialized DevicePlugin Stub. 92 func NewDevicePluginStub(devs []*pluginapi.Device, socket string, name string, preStartContainerFlag bool, getPreferredAllocationFlag bool) *Stub { 93 94 watcher, err := fsnotify.NewWatcher() 95 if err != nil { 96 klog.ErrorS(err, "Watcher creation failed") 97 panic(err) 98 } 99 100 return &Stub{ 101 devs: devs, 102 socket: socket, 103 resourceName: name, 104 preStartContainerFlag: preStartContainerFlag, 105 getPreferredAllocationFlag: getPreferredAllocationFlag, 106 registerControlFunc: defaultRegisterControlFunc, 107 108 stop: make(chan interface{}), 109 update: make(chan []*pluginapi.Device), 110 111 allocFunc: defaultAllocFunc, 112 getPreferredAllocFunc: defaultGetPreferredAllocFunc, 113 kubeletRestartWatcher: watcher, 114 } 115 } 116 117 // SetGetPreferredAllocFunc sets allocFunc of the device plugin 118 func (m *Stub) SetGetPreferredAllocFunc(f stubGetPreferredAllocFunc) { 119 m.getPreferredAllocFunc = f 120 } 121 122 // SetAllocFunc sets allocFunc of the device plugin 123 func (m *Stub) SetAllocFunc(f stubAllocFunc) { 124 m.allocFunc = f 125 } 126 127 // SetRegisterControlFunc sets RegisterControlFunc of the device plugin 128 func (m *Stub) SetRegisterControlFunc(f stubRegisterControlFunc) { 129 m.registerControlFunc = f 130 } 131 132 // Start starts the gRPC server of the device plugin. Can only 133 // be called once. 134 func (m *Stub) Start() error { 135 klog.InfoS("Starting device plugin server") 136 err := m.cleanup() 137 if err != nil { 138 return err 139 } 140 141 sock, err := net.Listen("unix", m.socket) 142 if err != nil { 143 return err 144 } 145 146 m.wg.Add(1) 147 m.server = grpc.NewServer([]grpc.ServerOption{}...) 148 pluginapi.RegisterDevicePluginServer(m.server, m) 149 watcherapi.RegisterRegistrationServer(m.server, m) 150 151 err = m.kubeletRestartWatcher.Add(filepath.Dir(m.socket)) 152 if err != nil { 153 klog.ErrorS(err, "Failed to add watch", "devicePluginPath", pluginapi.DevicePluginPath) 154 return err 155 } 156 157 go func() { 158 defer m.wg.Done() 159 m.server.Serve(sock) 160 }() 161 162 var lastDialErr error 163 wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) { 164 var conn *grpc.ClientConn 165 _, conn, lastDialErr = dial(m.socket) 166 if lastDialErr != nil { 167 return false, nil 168 } 169 conn.Close() 170 return true, nil 171 }) 172 if lastDialErr != nil { 173 return lastDialErr 174 } 175 176 klog.InfoS("Starting to serve on socket", "socket", m.socket) 177 return nil 178 } 179 180 func (m *Stub) Restart() error { 181 klog.InfoS("Restarting Device Plugin server") 182 if m.server == nil { 183 return nil 184 } 185 186 m.server.Stop() 187 m.server = nil 188 189 return m.Start() 190 } 191 192 // Stop stops the gRPC server. Can be called without a prior Start 193 // and more than once. Not safe to be called concurrently by different 194 // goroutines! 195 func (m *Stub) Stop() error { 196 klog.InfoS("Stopping device plugin server") 197 if m.server == nil { 198 return nil 199 } 200 201 m.kubeletRestartWatcher.Close() 202 203 m.server.Stop() 204 m.wg.Wait() 205 m.server = nil 206 close(m.stop) // This prevents re-starting the server. 207 208 return m.cleanup() 209 } 210 211 func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) { 212 for { 213 select { 214 // Detect a kubelet restart by watching for a newly created 215 // 'pluginapi.KubeletSocket' file. When this occurs, restart 216 // the device plugin server 217 case event := <-m.kubeletRestartWatcher.Events: 218 if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create { 219 klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint) 220 var lastErr error 221 222 err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) { 223 restartErr := m.Restart() 224 if restartErr == nil { 225 return true, nil 226 } 227 klog.ErrorS(restartErr, "Retrying after error") 228 lastErr = restartErr 229 return false, nil 230 }) 231 if err != nil { 232 klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error()) 233 panic(err) 234 } 235 236 if ok := m.registerControlFunc(); ok { 237 if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil { 238 klog.ErrorS(err, "Unable to register to kubelet") 239 panic(err) 240 } 241 } 242 } 243 244 // Watch for any other fs errors and log them. 245 case err := <-m.kubeletRestartWatcher.Errors: 246 klog.ErrorS(err, "inotify error") 247 } 248 } 249 } 250 251 // GetInfo is the RPC which return pluginInfo 252 func (m *Stub) GetInfo(ctx context.Context, req *watcherapi.InfoRequest) (*watcherapi.PluginInfo, error) { 253 klog.InfoS("GetInfo") 254 return &watcherapi.PluginInfo{ 255 Type: watcherapi.DevicePlugin, 256 Name: m.resourceName, 257 Endpoint: m.endpoint, 258 SupportedVersions: []string{pluginapi.Version}}, nil 259 } 260 261 // NotifyRegistrationStatus receives the registration notification from watcher 262 func (m *Stub) NotifyRegistrationStatus(ctx context.Context, status *watcherapi.RegistrationStatus) (*watcherapi.RegistrationStatusResponse, error) { 263 if m.registrationStatus != nil { 264 m.registrationStatus <- *status 265 } 266 if !status.PluginRegistered { 267 klog.InfoS("Registration failed", "err", status.Error) 268 } 269 return &watcherapi.RegistrationStatusResponse{}, nil 270 } 271 272 // Register registers the device plugin for the given resourceName with Kubelet. 273 func (m *Stub) Register(kubeletEndpoint, resourceName string, pluginSockDir string) error { 274 klog.InfoS("Register", "kubeletEndpoint", kubeletEndpoint, "resourceName", resourceName, "socket", pluginSockDir) 275 276 if pluginSockDir != "" { 277 if _, err := os.Stat(pluginSockDir + "DEPRECATION"); err == nil { 278 klog.InfoS("Deprecation file found. Skip registration") 279 return nil 280 } 281 } 282 klog.InfoS("Deprecation file not found. Invoke registration") 283 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 284 defer cancel() 285 286 conn, err := grpc.DialContext(ctx, kubeletEndpoint, 287 grpc.WithTransportCredentials(insecure.NewCredentials()), 288 grpc.WithBlock(), 289 grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { 290 return (&net.Dialer{}).DialContext(ctx, "unix", addr) 291 })) 292 if err != nil { 293 return err 294 } 295 defer conn.Close() 296 client := pluginapi.NewRegistrationClient(conn) 297 reqt := &pluginapi.RegisterRequest{ 298 Version: pluginapi.Version, 299 Endpoint: filepath.Base(m.socket), 300 ResourceName: resourceName, 301 Options: &pluginapi.DevicePluginOptions{ 302 PreStartRequired: m.preStartContainerFlag, 303 GetPreferredAllocationAvailable: m.getPreferredAllocationFlag, 304 }, 305 } 306 307 _, err = client.Register(context.Background(), reqt) 308 if err != nil { 309 // Stop server 310 m.server.Stop() 311 klog.ErrorS(err, "Client unable to register to kubelet") 312 return err 313 } 314 klog.InfoS("Device Plugin registered with the Kubelet") 315 return err 316 } 317 318 // GetDevicePluginOptions returns DevicePluginOptions settings for the device plugin. 319 func (m *Stub) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 320 options := &pluginapi.DevicePluginOptions{ 321 PreStartRequired: m.preStartContainerFlag, 322 GetPreferredAllocationAvailable: m.getPreferredAllocationFlag, 323 } 324 return options, nil 325 } 326 327 // PreStartContainer resets the devices received 328 func (m *Stub) PreStartContainer(ctx context.Context, r *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 329 klog.InfoS("PreStartContainer", "request", r) 330 return &pluginapi.PreStartContainerResponse{}, nil 331 } 332 333 // ListAndWatch lists devices and update that list according to the Update call 334 func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { 335 klog.InfoS("ListAndWatch") 336 337 s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) 338 339 for { 340 select { 341 case <-m.stop: 342 return nil 343 case updated := <-m.update: 344 s.Send(&pluginapi.ListAndWatchResponse{Devices: updated}) 345 } 346 } 347 } 348 349 // Update allows the device plugin to send new devices through ListAndWatch 350 func (m *Stub) Update(devs []*pluginapi.Device) { 351 m.update <- devs 352 } 353 354 // GetPreferredAllocation gets the preferred allocation from a set of available devices 355 func (m *Stub) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { 356 klog.InfoS("GetPreferredAllocation", "request", r) 357 358 devs := make(map[string]pluginapi.Device) 359 360 for _, dev := range m.devs { 361 devs[dev.ID] = *dev 362 } 363 364 return m.getPreferredAllocFunc(r, devs) 365 } 366 367 // Allocate does a mock allocation 368 func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 369 klog.InfoS("Allocate", "request", r) 370 371 devs := make(map[string]pluginapi.Device) 372 373 for _, dev := range m.devs { 374 devs[dev.ID] = *dev 375 } 376 377 return m.allocFunc(r, devs) 378 } 379 380 func (m *Stub) cleanup() error { 381 if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { 382 return err 383 } 384 385 return nil 386 }