github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/device.go (about)

     1  package nvidia
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
    12  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    13  	"github.com/hashicorp/nomad/plugins/base"
    14  	"github.com/hashicorp/nomad/plugins/device"
    15  	"github.com/hashicorp/nomad/plugins/shared/hclspec"
    16  )
    17  
    18  const (
    19  	// pluginName is the name of the plugin
    20  	pluginName = "nvidia-gpu"
    21  
    22  	// vendor is the vendor providing the devices
    23  	vendor = "nvidia"
    24  
    25  	// deviceType is the type of device being returned
    26  	deviceType = device.DeviceTypeGPU
    27  
    28  	// notAvailable value is returned to nomad server in case some properties were
    29  	// undetected by nvml driver
    30  	notAvailable = "N/A"
    31  )
    32  
    33  const (
    34  	// Nvidia-container-runtime environment variable names
    35  	NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
    36  )
    37  
    38  var (
    39  	// PluginID is the nvidia plugin metadata registered in the plugin
    40  	// catalog.
    41  	PluginID = loader.PluginID{
    42  		Name:       pluginName,
    43  		PluginType: base.PluginTypeDevice,
    44  	}
    45  
    46  	// PluginConfig is the nvidia factory function registered in the
    47  	// plugin catalog.
    48  	PluginConfig = &loader.InternalPluginConfig{
    49  		Factory: func(l log.Logger) interface{} { return NewNvidiaDevice(l) },
    50  	}
    51  
    52  	// pluginInfo describes the plugin
    53  	pluginInfo = &base.PluginInfoResponse{
    54  		Type:              base.PluginTypeDevice,
    55  		PluginApiVersions: []string{device.ApiVersion010},
    56  		PluginVersion:     "0.1.0",
    57  		Name:              pluginName,
    58  	}
    59  
    60  	// configSpec is the specification of the plugin's configuration
    61  	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
    62  		"ignored_gpu_ids": hclspec.NewDefault(
    63  			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
    64  			hclspec.NewLiteral("[]"),
    65  		),
    66  		"fingerprint_period": hclspec.NewDefault(
    67  			hclspec.NewAttr("fingerprint_period", "string", false),
    68  			hclspec.NewLiteral("\"1m\""),
    69  		),
    70  	})
    71  )
    72  
    73  // Config contains configuration information for the plugin.
    74  type Config struct {
    75  	IgnoredGPUIDs     []string `codec:"ignored_gpu_ids"`
    76  	FingerprintPeriod string   `codec:"fingerprint_period"`
    77  }
    78  
    79  // NvidiaDevice contains all plugin specific data
    80  type NvidiaDevice struct {
    81  	// nvmlClient is used to get data from nvidia
    82  	nvmlClient nvml.NvmlClient
    83  
    84  	// initErr holds an error retrieved during
    85  	// nvmlClient initialization
    86  	initErr error
    87  
    88  	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
    89  	ignoredGPUIDs map[string]struct{}
    90  
    91  	// fingerprintPeriod is how often we should call nvml to get list of devices
    92  	fingerprintPeriod time.Duration
    93  
    94  	// devices is the set of detected eligible devices
    95  	devices    map[string]struct{}
    96  	deviceLock sync.RWMutex
    97  
    98  	logger log.Logger
    99  }
   100  
   101  // NewNvidiaDevice returns a new nvidia device plugin.
   102  func NewNvidiaDevice(log log.Logger) *NvidiaDevice {
   103  	nvmlClient, err := nvml.NewNvmlClient()
   104  	logger := log.Named(pluginName)
   105  	if err != nil && err.Error() != nvml.UnavailableLib.Error() {
   106  		logger.Error("unable to initialize Nvidia driver", "reason", err)
   107  	}
   108  	return &NvidiaDevice{
   109  		logger:        logger,
   110  		devices:       make(map[string]struct{}),
   111  		ignoredGPUIDs: make(map[string]struct{}),
   112  		nvmlClient:    nvmlClient,
   113  		initErr:       err,
   114  	}
   115  }
   116  
   117  // PluginInfo returns information describing the plugin.
   118  func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
   119  	return pluginInfo, nil
   120  }
   121  
   122  // ConfigSchema returns the plugins configuration schema.
   123  func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
   124  	return configSpec, nil
   125  }
   126  
   127  // SetConfig is used to set the configuration of the plugin.
   128  func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
   129  	var config Config
   130  	if len(cfg.PluginConfig) != 0 {
   131  		if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil {
   132  			return err
   133  		}
   134  	}
   135  
   136  	for _, ignoredGPUId := range config.IgnoredGPUIDs {
   137  		d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
   138  	}
   139  
   140  	period, err := time.ParseDuration(config.FingerprintPeriod)
   141  	if err != nil {
   142  		return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
   143  	}
   144  	d.fingerprintPeriod = period
   145  
   146  	return nil
   147  }
   148  
   149  // Fingerprint streams detected devices. If device changes are detected or the
   150  // devices health changes, messages will be emitted.
   151  func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
   152  	outCh := make(chan *device.FingerprintResponse)
   153  	go d.fingerprint(ctx, outCh)
   154  	return outCh, nil
   155  }
   156  
   157  type reservationError struct {
   158  	notExistingIDs []string
   159  }
   160  
   161  func (e *reservationError) Error() string {
   162  	return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
   163  }
   164  
   165  // Reserve returns information on how to mount given devices.
   166  // Assumption is made that nomad server is responsible for correctness of
   167  // GPU allocations, handling tricky cases such as double-allocation of single GPU
   168  func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
   169  	if len(deviceIDs) == 0 {
   170  		return &device.ContainerReservation{}, nil
   171  	}
   172  	// Due to the asynchronous nature of NvidiaPlugin, there is a possibility
   173  	// of race condition
   174  	//
   175  	// Timeline:
   176  	// 	1 - fingerprint reports that GPU with id "1" is present
   177  	//  2 - the following events happen at the same time:
   178  	// 		a) server decides to allocate GPU with id "1"
   179  	//      b) fingerprint check reports that GPU with id "1" is no more present
   180  	//
   181  	// The latest and always valid version of fingerprinted ids are stored in
   182  	// d.devices map. To avoid this race condition an error is returned if
   183  	// any of provided deviceIDs is not found in d.devices map
   184  	d.deviceLock.RLock()
   185  	var notExistingIDs []string
   186  	for _, id := range deviceIDs {
   187  		if _, deviceIDExists := d.devices[id]; !deviceIDExists {
   188  			notExistingIDs = append(notExistingIDs, id)
   189  		}
   190  	}
   191  	d.deviceLock.RUnlock()
   192  	if len(notExistingIDs) != 0 {
   193  		return nil, &reservationError{notExistingIDs}
   194  	}
   195  
   196  	return &device.ContainerReservation{
   197  		Envs: map[string]string{
   198  			NvidiaVisibleDevices: strings.Join(deviceIDs, ","),
   199  		},
   200  	}, nil
   201  }
   202  
   203  // Stats streams statistics for the detected devices.
   204  func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
   205  	outCh := make(chan *device.StatsResponse)
   206  	go d.stats(ctx, outCh, interval)
   207  	return outCh, nil
   208  }