github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/devices/gpu/nvidia/device.go (about)

     1  package nvidia
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
    12  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    13  	"github.com/hashicorp/nomad/plugins/base"
    14  	"github.com/hashicorp/nomad/plugins/device"
    15  	"github.com/hashicorp/nomad/plugins/shared/hclspec"
    16  )
    17  
    18  const (
    19  	// pluginName is the name of the plugin
    20  	pluginName = "nvidia-gpu"
    21  
    22  	// vendor is the vendor providing the devices
    23  	vendor = "nvidia"
    24  
    25  	// deviceType is the type of device being returned
    26  	deviceType = device.DeviceTypeGPU
    27  
    28  	// notAvailable value is returned to nomad server in case some properties were
    29  	// undetected by nvml driver
    30  	notAvailable = "N/A"
    31  
    32  	// Nvidia-container-runtime environment variable names
    33  	NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
    34  )
    35  
    36  var (
    37  	// PluginID is the nvidia plugin metadata registered in the plugin
    38  	// catalog.
    39  	PluginID = loader.PluginID{
    40  		Name:       pluginName,
    41  		PluginType: base.PluginTypeDevice,
    42  	}
    43  
    44  	// PluginConfig is the nvidia factory function registered in the
    45  	// plugin catalog.
    46  	PluginConfig = &loader.InternalPluginConfig{
    47  		Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) },
    48  	}
    49  
    50  	// pluginInfo describes the plugin
    51  	pluginInfo = &base.PluginInfoResponse{
    52  		Type:              base.PluginTypeDevice,
    53  		PluginApiVersions: []string{device.ApiVersion010},
    54  		PluginVersion:     "0.1.0",
    55  		Name:              pluginName,
    56  	}
    57  
    58  	// configSpec is the specification of the plugin's configuration
    59  	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
    60  		"enabled": hclspec.NewDefault(
    61  			hclspec.NewAttr("enabled", "bool", false),
    62  			hclspec.NewLiteral("true"),
    63  		),
    64  		"ignored_gpu_ids": hclspec.NewDefault(
    65  			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
    66  			hclspec.NewLiteral("[]"),
    67  		),
    68  		"fingerprint_period": hclspec.NewDefault(
    69  			hclspec.NewAttr("fingerprint_period", "string", false),
    70  			hclspec.NewLiteral("\"1m\""),
    71  		),
    72  	})
    73  )
    74  
    75  // Config contains configuration information for the plugin.
    76  type Config struct {
    77  	Enabled           bool     `codec:"enabled"`
    78  	IgnoredGPUIDs     []string `codec:"ignored_gpu_ids"`
    79  	FingerprintPeriod string   `codec:"fingerprint_period"`
    80  }
    81  
    82  // NvidiaDevice contains all plugin specific data
    83  type NvidiaDevice struct {
    84  	// enabled indicates whether the plugin should be enabled
    85  	enabled bool
    86  
    87  	// nvmlClient is used to get data from nvidia
    88  	nvmlClient nvml.NvmlClient
    89  
    90  	// initErr holds an error retrieved during
    91  	// nvmlClient initialization
    92  	initErr error
    93  
    94  	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
    95  	ignoredGPUIDs map[string]struct{}
    96  
    97  	// fingerprintPeriod is how often we should call nvml to get list of devices
    98  	fingerprintPeriod time.Duration
    99  
   100  	// devices is the set of detected eligible devices
   101  	devices    map[string]struct{}
   102  	deviceLock sync.RWMutex
   103  
   104  	logger log.Logger
   105  }
   106  
   107  // NewNvidiaDevice returns a new nvidia device plugin.
   108  func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice {
   109  	nvmlClient, err := nvml.NewNvmlClient()
   110  	logger := log.Named(pluginName)
   111  	if err != nil && err.Error() != nvml.UnavailableLib.Error() {
   112  		logger.Error("unable to initialize Nvidia driver", "reason", err)
   113  	}
   114  	return &NvidiaDevice{
   115  		logger:        logger,
   116  		devices:       make(map[string]struct{}),
   117  		ignoredGPUIDs: make(map[string]struct{}),
   118  		nvmlClient:    nvmlClient,
   119  		initErr:       err,
   120  	}
   121  }
   122  
   123  // PluginInfo returns information describing the plugin.
   124  func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
   125  	return pluginInfo, nil
   126  }
   127  
   128  // ConfigSchema returns the plugins configuration schema.
   129  func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
   130  	return configSpec, nil
   131  }
   132  
   133  // SetConfig is used to set the configuration of the plugin.
   134  func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
   135  	var config Config
   136  	if len(cfg.PluginConfig) != 0 {
   137  		if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil {
   138  			return err
   139  		}
   140  	}
   141  
   142  	d.enabled = config.Enabled
   143  
   144  	for _, ignoredGPUId := range config.IgnoredGPUIDs {
   145  		d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
   146  	}
   147  
   148  	period, err := time.ParseDuration(config.FingerprintPeriod)
   149  	if err != nil {
   150  		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
   151  	}
   152  	d.fingerprintPeriod = period
   153  
   154  	return nil
   155  }
   156  
   157  // Fingerprint streams detected devices. If device changes are detected or the
   158  // devices health changes, messages will be emitted.
   159  func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
   160  	if !d.enabled {
   161  		return nil, device.ErrPluginDisabled
   162  	}
   163  
   164  	outCh := make(chan *device.FingerprintResponse)
   165  	go d.fingerprint(ctx, outCh)
   166  	return outCh, nil
   167  }
   168  
   169  type reservationError struct {
   170  	notExistingIDs []string
   171  }
   172  
   173  func (e *reservationError) Error() string {
   174  	return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
   175  }
   176  
   177  // Reserve returns information on how to mount given devices.
   178  // Assumption is made that nomad server is responsible for correctness of
   179  // GPU allocations, handling tricky cases such as double-allocation of single GPU
   180  func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
   181  	if len(deviceIDs) == 0 {
   182  		return &device.ContainerReservation{}, nil
   183  	}
   184  	if !d.enabled {
   185  		return nil, device.ErrPluginDisabled
   186  	}
   187  
   188  	// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
   189  	// of race condition
   190  	//
   191  	// Timeline:
   192  	// 	1 - fingerprint reports that GPU with id "1" is present
   193  	//  2 - the following events happen at the same time:
   194  	// 		a) server decides to allocate GPU with id "1"
   195  	//      b) fingerprint check reports that GPU with id "1" is no more present
   196  	//
   197  	// The latest and always valid version of fingerprinted ids are stored in
   198  	// d.devices map. To avoid this race condition an error is returned if
   199  	// any of provided deviceIDs is not found in d.devices map
   200  	d.deviceLock.RLock()
   201  	var notExistingIDs []string
   202  	for _, id := range deviceIDs {
   203  		if _, deviceIDExists := d.devices[id]; !deviceIDExists {
   204  			notExistingIDs = append(notExistingIDs, id)
   205  		}
   206  	}
   207  	d.deviceLock.RUnlock()
   208  	if len(notExistingIDs) != 0 {
   209  		return nil, &reservationError{notExistingIDs}
   210  	}
   211  
   212  	return &device.ContainerReservation{
   213  		Envs: map[string]string{
   214  			NvidiaVisibleDevices: strings.Join(deviceIDs, ","),
   215  		},
   216  	}, nil
   217  }
   218  
   219  // Stats streams statistics for the detected devices.
   220  func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
   221  	if !d.enabled {
   222  		return nil, device.ErrPluginDisabled
   223  	}
   224  
   225  	outCh := make(chan *device.StatsResponse)
   226  	go d.stats(ctx, outCh, interval)
   227  	return outCh, nil
   228  }