// Source: github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/device.go

package nvidia

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
	"github.com/hashicorp/nomad/helper/pluginutils/loader"
	"github.com/hashicorp/nomad/plugins/base"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/shared/hclspec"
)

const (
	// pluginName is the name of the plugin. It is used for the plugin
	// catalog ID, the plugin info response and the name of the plugin's
	// logger.
	pluginName = "nvidia-gpu"

	// vendor is the vendor providing the devices
	vendor = "nvidia"

	// deviceType is the type of device being returned
	deviceType = device.DeviceTypeGPU

	// notAvailable value is returned to nomad server in case some properties were
	// undetected by nvml driver
	notAvailable = "N/A"
)

const (
	// Nvidia-container-runtime environment variable names

	// NvidiaVisibleDevices is the environment variable that Reserve sets to
	// the comma-separated list of reserved device IDs.
	NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)

var (
	// PluginID is the nvidia plugin metadata registered in the plugin
	// catalog.
	PluginID = loader.PluginID{
		Name:       pluginName,
		PluginType: base.PluginTypeDevice,
	}

	// PluginConfig is the nvidia factory function registered in the
	// plugin catalog. The factory builds the plugin with NewNvidiaDevice.
	PluginConfig = &loader.InternalPluginConfig{
		Factory: func(l log.Logger) interface{} { return NewNvidiaDevice(l) },
	}

	// pluginInfo describes the plugin. It is the value returned by
	// PluginInfo.
	pluginInfo = &base.PluginInfoResponse{
		Type:              base.PluginTypeDevice,
		PluginApiVersions: []string{device.ApiVersion010},
		PluginVersion:     "0.1.0",
		Name:              pluginName,
	}

	// configSpec is the specification of the plugin's configuration. It is
	// the value returned by ConfigSchema and declares two attributes:
	//
	//   - ignored_gpu_ids: list(string), defaulting to an empty list
	//   - fingerprint_period: string, defaulting to "1m"
	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
		"ignored_gpu_ids": hclspec.NewDefault(
			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
			hclspec.NewLiteral("[]"),
		),
		"fingerprint_period": hclspec.NewDefault(
			hclspec.NewAttr("fingerprint_period", "string", false),
			hclspec.NewLiteral("\"1m\""),
		),
	})
)

// Config contains configuration information for the plugin. SetConfig decodes
// it from the MsgPack-encoded plugin configuration.
type Config struct {
	// IgnoredGPUIDs lists the GPU IDs that SetConfig adds to the device's
	// ignored set.
	IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`

	// FingerprintPeriod is a duration string; SetConfig parses it with
	// time.ParseDuration.
	FingerprintPeriod string `codec:"fingerprint_period"`
}

// NvidiaDevice contains all plugin specific data
type NvidiaDevice struct {
	// nvmlClient is used to get data from nvidia
	nvmlClient nvml.NvmlClient

	// initErr holds an error retrieved during
	// nvmlClient initialization
	initErr error

	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
	ignoredGPUIDs map[string]struct{}

	// fingerprintPeriod is how often we should call nvml to get list of devices
	fingerprintPeriod time.Duration

	// devices is the set of detected eligible devices
	devices map[string]struct{}
	// deviceLock is read-locked by Reserve while it looks IDs up in devices.
	deviceLock sync.RWMutex

	// logger is the plugin's named logger, created in NewNvidiaDevice.
	logger log.Logger
}

// NewNvidiaDevice returns a new nvidia device plugin.
102 func NewNvidiaDevice(log log.Logger) *NvidiaDevice { 103 nvmlClient, err := nvml.NewNvmlClient() 104 logger := log.Named(pluginName) 105 if err != nil && err.Error() != nvml.UnavailableLib.Error() { 106 logger.Error("unable to initialize Nvidia driver", "reason", err) 107 } 108 return &NvidiaDevice{ 109 logger: logger, 110 devices: make(map[string]struct{}), 111 ignoredGPUIDs: make(map[string]struct{}), 112 nvmlClient: nvmlClient, 113 initErr: err, 114 } 115 } 116 117 // PluginInfo returns information describing the plugin. 118 func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { 119 return pluginInfo, nil 120 } 121 122 // ConfigSchema returns the plugins configuration schema. 123 func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { 124 return configSpec, nil 125 } 126 127 // SetConfig is used to set the configuration of the plugin. 128 func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { 129 var config Config 130 if len(cfg.PluginConfig) != 0 { 131 if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { 132 return err 133 } 134 } 135 136 for _, ignoredGPUId := range config.IgnoredGPUIDs { 137 d.ignoredGPUIDs[ignoredGPUId] = struct{}{} 138 } 139 140 period, err := time.ParseDuration(config.FingerprintPeriod) 141 if err != nil { 142 return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) 143 } 144 d.fingerprintPeriod = period 145 146 return nil 147 } 148 149 // Fingerprint streams detected devices. If device changes are detected or the 150 // devices health changes, messages will be emitted. 
151 func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { 152 outCh := make(chan *device.FingerprintResponse) 153 go d.fingerprint(ctx, outCh) 154 return outCh, nil 155 } 156 157 type reservationError struct { 158 notExistingIDs []string 159 } 160 161 func (e *reservationError) Error() string { 162 return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) 163 } 164 165 // Reserve returns information on how to mount given devices. 166 // Assumption is made that nomad server is responsible for correctness of 167 // GPU allocations, handling tricky cases such as double-allocation of single GPU 168 func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { 169 if len(deviceIDs) == 0 { 170 return &device.ContainerReservation{}, nil 171 } 172 // Due to the asynchronous nature of NvidiaPlugin, there is a possibility 173 // of race condition 174 // 175 // Timeline: 176 // 1 - fingerprint reports that GPU with id "1" is present 177 // 2 - the following events happen at the same time: 178 // a) server decides to allocate GPU with id "1" 179 // b) fingerprint check reports that GPU with id "1" is no more present 180 // 181 // The latest and always valid version of fingerprinted ids are stored in 182 // d.devices map. To avoid this race condition an error is returned if 183 // any of provided deviceIDs is not found in d.devices map 184 d.deviceLock.RLock() 185 var notExistingIDs []string 186 for _, id := range deviceIDs { 187 if _, deviceIDExists := d.devices[id]; !deviceIDExists { 188 notExistingIDs = append(notExistingIDs, id) 189 } 190 } 191 d.deviceLock.RUnlock() 192 if len(notExistingIDs) != 0 { 193 return nil, &reservationError{notExistingIDs} 194 } 195 196 return &device.ContainerReservation{ 197 Envs: map[string]string{ 198 NvidiaVisibleDevices: strings.Join(deviceIDs, ","), 199 }, 200 }, nil 201 } 202 203 // Stats streams statistics for the detected devices. 
204 func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { 205 outCh := make(chan *device.StatsResponse) 206 go d.stats(ctx, outCh, interval) 207 return outCh, nil 208 }