// github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/devices/gpu/nvidia/device.go (about)

package nvidia

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
	"github.com/hashicorp/nomad/helper/pluginutils/loader"
	"github.com/hashicorp/nomad/plugins/base"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/shared/hclspec"
)

const (
	// pluginName is the name of the plugin. It is used for the plugin ID, the
	// plugin info response and the name of the plugin's logger.
	pluginName = "nvidia-gpu"

	// vendor is the vendor providing the devices.
	vendor = "nvidia"

	// deviceType is the type of device being returned.
	deviceType = device.DeviceTypeGPU

	// notAvailable is the value returned to the nomad server when a property
	// was not detected by the nvml driver.
	notAvailable = "N/A"

	// NvidiaVisibleDevices is the nvidia-container-runtime environment
	// variable name. Reserve sets it to the comma-separated list of reserved
	// device IDs.
	NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)

var (
	// PluginID is the nvidia plugin metadata registered in the plugin
	// catalog.
	PluginID = loader.PluginID{
		Name:       pluginName,
		PluginType: base.PluginTypeDevice,
	}

	// PluginConfig is the nvidia factory function registered in the
	// plugin catalog. The factory builds the plugin with NewNvidiaDevice.
	PluginConfig = &loader.InternalPluginConfig{
		Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) },
	}

	// pluginInfo describes the plugin. It is the value returned by
	// PluginInfo.
	pluginInfo = &base.PluginInfoResponse{
		Type:              base.PluginTypeDevice,
		PluginApiVersions: []string{device.ApiVersion010},
		PluginVersion:     "0.1.0",
		Name:              pluginName,
	}

	// configSpec is the specification of the plugin's configuration. All
	// three attributes are optional and fall back to the literal defaults
	// given here: enabled = true, ignored_gpu_ids = [] and
	// fingerprint_period = "1m".
	configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
		"enabled": hclspec.NewDefault(
			hclspec.NewAttr("enabled", "bool", false),
			hclspec.NewLiteral("true"),
		),
		"ignored_gpu_ids": hclspec.NewDefault(
			hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
			hclspec.NewLiteral("[]"),
		),
		"fingerprint_period": hclspec.NewDefault(
			hclspec.NewAttr("fingerprint_period", "string", false),
			hclspec.NewLiteral("\"1m\""),
		),
	})
)

// Config contains configuration information for the plugin. SetConfig decodes
// the plugin configuration payload into this struct.
type Config struct {
	// Enabled is copied to NvidiaDevice.enabled by SetConfig.
	Enabled bool `codec:"enabled"`
	// IgnoredGPUIDs lists the GPU IDs that SetConfig records in
	// NvidiaDevice.ignoredGPUIDs.
	IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
	// FingerprintPeriod is a duration string; SetConfig parses it with
	// time.ParseDuration.
	FingerprintPeriod string `codec:"fingerprint_period"`
}

// NvidiaDevice contains all plugin specific data
type NvidiaDevice struct {
	// enabled indicates whether the plugin should be enabled. Fingerprint,
	// Stats and Reserve return device.ErrPluginDisabled while it is false.
	enabled bool

	// nvmlClient is used to get data from nvidia
	nvmlClient nvml.NvmlClient

	// initErr holds an error retrieved during
	// nvmlClient initialization
	initErr error

	// ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
	ignoredGPUIDs map[string]struct{}

	// fingerprintPeriod is how often we should call nvml to get list of devices
	fingerprintPeriod time.Duration

	// devices is the set of detected eligible devices. Reserve holds
	// deviceLock for reading while it looks IDs up in this map.
	devices    map[string]struct{}
	deviceLock sync.RWMutex

	logger log.Logger
}

// NewNvidiaDevice returns a new nvidia device plugin.
108 func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice { 109 nvmlClient, err := nvml.NewNvmlClient() 110 logger := log.Named(pluginName) 111 if err != nil && err.Error() != nvml.UnavailableLib.Error() { 112 logger.Error("unable to initialize Nvidia driver", "reason", err) 113 } 114 return &NvidiaDevice{ 115 logger: logger, 116 devices: make(map[string]struct{}), 117 ignoredGPUIDs: make(map[string]struct{}), 118 nvmlClient: nvmlClient, 119 initErr: err, 120 } 121 } 122 123 // PluginInfo returns information describing the plugin. 124 func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { 125 return pluginInfo, nil 126 } 127 128 // ConfigSchema returns the plugins configuration schema. 129 func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { 130 return configSpec, nil 131 } 132 133 // SetConfig is used to set the configuration of the plugin. 134 func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { 135 var config Config 136 if len(cfg.PluginConfig) != 0 { 137 if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { 138 return err 139 } 140 } 141 142 d.enabled = config.Enabled 143 144 for _, ignoredGPUId := range config.IgnoredGPUIDs { 145 d.ignoredGPUIDs[ignoredGPUId] = struct{}{} 146 } 147 148 period, err := time.ParseDuration(config.FingerprintPeriod) 149 if err != nil { 150 return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) 151 } 152 d.fingerprintPeriod = period 153 154 return nil 155 } 156 157 // Fingerprint streams detected devices. If device changes are detected or the 158 // devices health changes, messages will be emitted. 
159 func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { 160 if !d.enabled { 161 return nil, device.ErrPluginDisabled 162 } 163 164 outCh := make(chan *device.FingerprintResponse) 165 go d.fingerprint(ctx, outCh) 166 return outCh, nil 167 } 168 169 type reservationError struct { 170 notExistingIDs []string 171 } 172 173 func (e *reservationError) Error() string { 174 return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) 175 } 176 177 // Reserve returns information on how to mount given devices. 178 // Assumption is made that nomad server is responsible for correctness of 179 // GPU allocations, handling tricky cases such as double-allocation of single GPU 180 func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { 181 if len(deviceIDs) == 0 { 182 return &device.ContainerReservation{}, nil 183 } 184 if !d.enabled { 185 return nil, device.ErrPluginDisabled 186 } 187 188 // Due to the asynchronous nature of NvidiaPlugin, there is a possibility 189 // of race condition 190 // 191 // Timeline: 192 // 1 - fingerprint reports that GPU with id "1" is present 193 // 2 - the following events happen at the same time: 194 // a) server decides to allocate GPU with id "1" 195 // b) fingerprint check reports that GPU with id "1" is no more present 196 // 197 // The latest and always valid version of fingerprinted ids are stored in 198 // d.devices map. 
To avoid this race condition an error is returned if 199 // any of provided deviceIDs is not found in d.devices map 200 d.deviceLock.RLock() 201 var notExistingIDs []string 202 for _, id := range deviceIDs { 203 if _, deviceIDExists := d.devices[id]; !deviceIDExists { 204 notExistingIDs = append(notExistingIDs, id) 205 } 206 } 207 d.deviceLock.RUnlock() 208 if len(notExistingIDs) != 0 { 209 return nil, &reservationError{notExistingIDs} 210 } 211 212 return &device.ContainerReservation{ 213 Envs: map[string]string{ 214 NvidiaVisibleDevices: strings.Join(deviceIDs, ","), 215 }, 216 }, nil 217 } 218 219 // Stats streams statistics for the detected devices. 220 func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { 221 if !d.enabled { 222 return nil, device.ErrPluginDisabled 223 } 224 225 outCh := make(chan *device.StatsResponse) 226 go d.stats(ctx, outCh, interval) 227 return outCh, nil 228 }