k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package v1beta1
    18  
    19  import (
    20  	"context"
    21  	"net"
    22  	"os"
    23  	"path/filepath"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/fsnotify/fsnotify"
    28  	"google.golang.org/grpc"
    29  	"google.golang.org/grpc/credentials/insecure"
    30  
    31  	"k8s.io/apimachinery/pkg/util/wait"
    32  	"k8s.io/klog/v2"
    33  	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
    34  	watcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
    35  )
    36  
    37  // Stub implementation for DevicePlugin.
    38  type Stub struct {
    39  	devs                       []*pluginapi.Device
    40  	socket                     string
    41  	resourceName               string
    42  	preStartContainerFlag      bool
    43  	getPreferredAllocationFlag bool
    44  
    45  	stop   chan interface{}
    46  	wg     sync.WaitGroup
    47  	update chan []*pluginapi.Device
    48  
    49  	server *grpc.Server
    50  
    51  	// allocFunc is used for handling allocation request
    52  	allocFunc stubAllocFunc
    53  
    54  	// getPreferredAllocFunc is used for handling getPreferredAllocation request
    55  	getPreferredAllocFunc stubGetPreferredAllocFunc
    56  
    57  	// registerControlFunc is used for controlling auto-registration of requests
    58  	registerControlFunc stubRegisterControlFunc
    59  
    60  	registrationStatus chan watcherapi.RegistrationStatus // for testing
    61  	endpoint           string                             // for testing
    62  
    63  	kubeletRestartWatcher *fsnotify.Watcher
    64  }
    65  
    66  // stubGetPreferredAllocFunc is the function called when a getPreferredAllocation request is received from Kubelet
    67  type stubGetPreferredAllocFunc func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error)
    68  
    69  func defaultGetPreferredAllocFunc(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
    70  	var response pluginapi.PreferredAllocationResponse
    71  
    72  	return &response, nil
    73  }
    74  
    75  // stubAllocFunc is the function called when an allocation request is received from Kubelet
    76  type stubAllocFunc func(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error)
    77  
    78  func defaultAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
    79  	var response pluginapi.AllocateResponse
    80  
    81  	return &response, nil
    82  }
    83  
    84  // stubRegisterControlFunc is the function called when a registration request is received from Kubelet
    85  type stubRegisterControlFunc func() bool
    86  
    87  func defaultRegisterControlFunc() bool {
    88  	return true
    89  }
    90  
    91  // NewDevicePluginStub returns an initialized DevicePlugin Stub.
    92  func NewDevicePluginStub(devs []*pluginapi.Device, socket string, name string, preStartContainerFlag bool, getPreferredAllocationFlag bool) *Stub {
    93  
    94  	watcher, err := fsnotify.NewWatcher()
    95  	if err != nil {
    96  		klog.ErrorS(err, "Watcher creation failed")
    97  		panic(err)
    98  	}
    99  
   100  	return &Stub{
   101  		devs:                       devs,
   102  		socket:                     socket,
   103  		resourceName:               name,
   104  		preStartContainerFlag:      preStartContainerFlag,
   105  		getPreferredAllocationFlag: getPreferredAllocationFlag,
   106  		registerControlFunc:        defaultRegisterControlFunc,
   107  
   108  		stop:   make(chan interface{}),
   109  		update: make(chan []*pluginapi.Device),
   110  
   111  		allocFunc:             defaultAllocFunc,
   112  		getPreferredAllocFunc: defaultGetPreferredAllocFunc,
   113  		kubeletRestartWatcher: watcher,
   114  	}
   115  }
   116  
   117  // SetGetPreferredAllocFunc sets allocFunc of the device plugin
   118  func (m *Stub) SetGetPreferredAllocFunc(f stubGetPreferredAllocFunc) {
   119  	m.getPreferredAllocFunc = f
   120  }
   121  
   122  // SetAllocFunc sets allocFunc of the device plugin
   123  func (m *Stub) SetAllocFunc(f stubAllocFunc) {
   124  	m.allocFunc = f
   125  }
   126  
   127  // SetRegisterControlFunc sets RegisterControlFunc of the device plugin
   128  func (m *Stub) SetRegisterControlFunc(f stubRegisterControlFunc) {
   129  	m.registerControlFunc = f
   130  }
   131  
   132  // Start starts the gRPC server of the device plugin. Can only
   133  // be called once.
   134  func (m *Stub) Start() error {
   135  	klog.InfoS("Starting device plugin server")
   136  	err := m.cleanup()
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	sock, err := net.Listen("unix", m.socket)
   142  	if err != nil {
   143  		return err
   144  	}
   145  
   146  	m.wg.Add(1)
   147  	m.server = grpc.NewServer([]grpc.ServerOption{}...)
   148  	pluginapi.RegisterDevicePluginServer(m.server, m)
   149  	watcherapi.RegisterRegistrationServer(m.server, m)
   150  
   151  	err = m.kubeletRestartWatcher.Add(filepath.Dir(m.socket))
   152  	if err != nil {
   153  		klog.ErrorS(err, "Failed to add watch", "devicePluginPath", pluginapi.DevicePluginPath)
   154  		return err
   155  	}
   156  
   157  	go func() {
   158  		defer m.wg.Done()
   159  		m.server.Serve(sock)
   160  	}()
   161  
   162  	var lastDialErr error
   163  	wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) {
   164  		var conn *grpc.ClientConn
   165  		_, conn, lastDialErr = dial(m.socket)
   166  		if lastDialErr != nil {
   167  			return false, nil
   168  		}
   169  		conn.Close()
   170  		return true, nil
   171  	})
   172  	if lastDialErr != nil {
   173  		return lastDialErr
   174  	}
   175  
   176  	klog.InfoS("Starting to serve on socket", "socket", m.socket)
   177  	return nil
   178  }
   179  
   180  func (m *Stub) Restart() error {
   181  	klog.InfoS("Restarting Device Plugin server")
   182  	if m.server == nil {
   183  		return nil
   184  	}
   185  
   186  	m.server.Stop()
   187  	m.server = nil
   188  
   189  	return m.Start()
   190  }
   191  
   192  // Stop stops the gRPC server. Can be called without a prior Start
   193  // and more than once. Not safe to be called concurrently by different
   194  // goroutines!
   195  func (m *Stub) Stop() error {
   196  	klog.InfoS("Stopping device plugin server")
   197  	if m.server == nil {
   198  		return nil
   199  	}
   200  
   201  	m.kubeletRestartWatcher.Close()
   202  
   203  	m.server.Stop()
   204  	m.wg.Wait()
   205  	m.server = nil
   206  	close(m.stop) // This prevents re-starting the server.
   207  
   208  	return m.cleanup()
   209  }
   210  
   211  func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) {
   212  	for {
   213  		select {
   214  		// Detect a kubelet restart by watching for a newly created
   215  		// 'pluginapi.KubeletSocket' file. When this occurs, restart
   216  		// the device plugin server
   217  		case event := <-m.kubeletRestartWatcher.Events:
   218  			if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create {
   219  				klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint)
   220  				var lastErr error
   221  
   222  				err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) {
   223  					restartErr := m.Restart()
   224  					if restartErr == nil {
   225  						return true, nil
   226  					}
   227  					klog.ErrorS(restartErr, "Retrying after error")
   228  					lastErr = restartErr
   229  					return false, nil
   230  				})
   231  				if err != nil {
   232  					klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error())
   233  					panic(err)
   234  				}
   235  
   236  				if ok := m.registerControlFunc(); ok {
   237  					if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil {
   238  						klog.ErrorS(err, "Unable to register to kubelet")
   239  						panic(err)
   240  					}
   241  				}
   242  			}
   243  
   244  		// Watch for any other fs errors and log them.
   245  		case err := <-m.kubeletRestartWatcher.Errors:
   246  			klog.ErrorS(err, "inotify error")
   247  		}
   248  	}
   249  }
   250  
   251  // GetInfo is the RPC which return pluginInfo
   252  func (m *Stub) GetInfo(ctx context.Context, req *watcherapi.InfoRequest) (*watcherapi.PluginInfo, error) {
   253  	klog.InfoS("GetInfo")
   254  	return &watcherapi.PluginInfo{
   255  		Type:              watcherapi.DevicePlugin,
   256  		Name:              m.resourceName,
   257  		Endpoint:          m.endpoint,
   258  		SupportedVersions: []string{pluginapi.Version}}, nil
   259  }
   260  
   261  // NotifyRegistrationStatus receives the registration notification from watcher
   262  func (m *Stub) NotifyRegistrationStatus(ctx context.Context, status *watcherapi.RegistrationStatus) (*watcherapi.RegistrationStatusResponse, error) {
   263  	if m.registrationStatus != nil {
   264  		m.registrationStatus <- *status
   265  	}
   266  	if !status.PluginRegistered {
   267  		klog.InfoS("Registration failed", "err", status.Error)
   268  	}
   269  	return &watcherapi.RegistrationStatusResponse{}, nil
   270  }
   271  
   272  // Register registers the device plugin for the given resourceName with Kubelet.
   273  func (m *Stub) Register(kubeletEndpoint, resourceName string, pluginSockDir string) error {
   274  	klog.InfoS("Register", "kubeletEndpoint", kubeletEndpoint, "resourceName", resourceName, "socket", pluginSockDir)
   275  
   276  	if pluginSockDir != "" {
   277  		if _, err := os.Stat(pluginSockDir + "DEPRECATION"); err == nil {
   278  			klog.InfoS("Deprecation file found. Skip registration")
   279  			return nil
   280  		}
   281  	}
   282  	klog.InfoS("Deprecation file not found. Invoke registration")
   283  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   284  	defer cancel()
   285  
   286  	conn, err := grpc.DialContext(ctx, kubeletEndpoint,
   287  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   288  		grpc.WithBlock(),
   289  		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
   290  			return (&net.Dialer{}).DialContext(ctx, "unix", addr)
   291  		}))
   292  	if err != nil {
   293  		return err
   294  	}
   295  	defer conn.Close()
   296  	client := pluginapi.NewRegistrationClient(conn)
   297  	reqt := &pluginapi.RegisterRequest{
   298  		Version:      pluginapi.Version,
   299  		Endpoint:     filepath.Base(m.socket),
   300  		ResourceName: resourceName,
   301  		Options: &pluginapi.DevicePluginOptions{
   302  			PreStartRequired:                m.preStartContainerFlag,
   303  			GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
   304  		},
   305  	}
   306  
   307  	_, err = client.Register(context.Background(), reqt)
   308  	if err != nil {
   309  		// Stop server
   310  		m.server.Stop()
   311  		klog.ErrorS(err, "Client unable to register to kubelet")
   312  		return err
   313  	}
   314  	klog.InfoS("Device Plugin registered with the Kubelet")
   315  	return err
   316  }
   317  
   318  // GetDevicePluginOptions returns DevicePluginOptions settings for the device plugin.
   319  func (m *Stub) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
   320  	options := &pluginapi.DevicePluginOptions{
   321  		PreStartRequired:                m.preStartContainerFlag,
   322  		GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
   323  	}
   324  	return options, nil
   325  }
   326  
   327  // PreStartContainer resets the devices received
   328  func (m *Stub) PreStartContainer(ctx context.Context, r *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
   329  	klog.InfoS("PreStartContainer", "request", r)
   330  	return &pluginapi.PreStartContainerResponse{}, nil
   331  }
   332  
   333  // ListAndWatch lists devices and update that list according to the Update call
   334  func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
   335  	klog.InfoS("ListAndWatch")
   336  
   337  	s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
   338  
   339  	for {
   340  		select {
   341  		case <-m.stop:
   342  			return nil
   343  		case updated := <-m.update:
   344  			s.Send(&pluginapi.ListAndWatchResponse{Devices: updated})
   345  		}
   346  	}
   347  }
   348  
   349  // Update allows the device plugin to send new devices through ListAndWatch
   350  func (m *Stub) Update(devs []*pluginapi.Device) {
   351  	m.update <- devs
   352  }
   353  
   354  // GetPreferredAllocation gets the preferred allocation from a set of available devices
   355  func (m *Stub) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
   356  	klog.InfoS("GetPreferredAllocation", "request", r)
   357  
   358  	devs := make(map[string]pluginapi.Device)
   359  
   360  	for _, dev := range m.devs {
   361  		devs[dev.ID] = *dev
   362  	}
   363  
   364  	return m.getPreferredAllocFunc(r, devs)
   365  }
   366  
   367  // Allocate does a mock allocation
   368  func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
   369  	klog.InfoS("Allocate", "request", r)
   370  
   371  	devs := make(map[string]pluginapi.Device)
   372  
   373  	for _, dev := range m.devs {
   374  		devs[dev.ID] = *dev
   375  	}
   376  
   377  	return m.allocFunc(r, devs)
   378  }
   379  
   380  func (m *Stub) cleanup() error {
   381  	if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
   382  		return err
   383  	}
   384  
   385  	return nil
   386  }