github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/devicemanager/manager_test.go (about) 1 package devicemanager 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "testing" 8 "time" 9 10 log "github.com/hashicorp/go-hclog" 11 plugin "github.com/hashicorp/go-plugin" 12 "github.com/hashicorp/nomad/client/state" 13 "github.com/hashicorp/nomad/helper" 14 "github.com/hashicorp/nomad/helper/pluginutils/loader" 15 "github.com/hashicorp/nomad/helper/testlog" 16 "github.com/hashicorp/nomad/helper/uuid" 17 "github.com/hashicorp/nomad/nomad/structs" 18 "github.com/hashicorp/nomad/plugins/base" 19 "github.com/hashicorp/nomad/plugins/device" 20 psstructs "github.com/hashicorp/nomad/plugins/shared/structs" 21 "github.com/hashicorp/nomad/testutil" 22 "github.com/stretchr/testify/require" 23 ) 24 25 var ( 26 nvidiaDevice0ID = uuid.Generate() 27 nvidiaDevice1ID = uuid.Generate() 28 nvidiaDeviceGroup = &device.DeviceGroup{ 29 Vendor: "nvidia", 30 Type: "gpu", 31 Name: "1080ti", 32 Devices: []*device.Device{ 33 { 34 ID: nvidiaDevice0ID, 35 Healthy: true, 36 }, 37 { 38 ID: nvidiaDevice1ID, 39 Healthy: true, 40 }, 41 }, 42 Attributes: map[string]*psstructs.Attribute{ 43 "memory": { 44 Int: helper.Int64ToPtr(4), 45 Unit: "GB", 46 }, 47 }, 48 } 49 50 intelDeviceID = uuid.Generate() 51 intelDeviceGroup = &device.DeviceGroup{ 52 Vendor: "intel", 53 Type: "gpu", 54 Name: "640GT", 55 Devices: []*device.Device{ 56 { 57 ID: intelDeviceID, 58 Healthy: true, 59 }, 60 }, 61 Attributes: map[string]*psstructs.Attribute{ 62 "memory": { 63 Int: helper.Int64ToPtr(2), 64 Unit: "GB", 65 }, 66 }, 67 } 68 69 nvidiaDeviceGroupStats = &device.DeviceGroupStats{ 70 Vendor: "nvidia", 71 Type: "gpu", 72 Name: "1080ti", 73 InstanceStats: map[string]*device.DeviceStats{ 74 nvidiaDevice0ID: { 75 Summary: &psstructs.StatValue{ 76 IntNumeratorVal: helper.Int64ToPtr(212), 77 Unit: "F", 78 Desc: "Temperature", 79 }, 80 }, 81 nvidiaDevice1ID: { 82 Summary: &psstructs.StatValue{ 83 IntNumeratorVal: helper.Int64ToPtr(218), 84 Unit: "F", 85 Desc: "Temperature", 86 }, 87 }, 88 }, 89 } 90 91 intelDeviceGroupStats = &device.DeviceGroupStats{ 92 Vendor: "intel", 93 Type: "gpu", 94 Name: "640GT", 95 InstanceStats: map[string]*device.DeviceStats{ 96 intelDeviceID: { 97 Summary: &psstructs.StatValue{ 98 IntNumeratorVal: helper.Int64ToPtr(220), 99 Unit: "F", 100 Desc: "Temperature", 101 }, 102 }, 103 }, 104 } 105 ) 106 107 func baseTestConfig(t *testing.T) ( 108 config *Config, 109 deviceUpdateCh chan []*structs.NodeDeviceResource, 110 catalog *loader.MockCatalog) { 111 112 // Create an update handler 113 deviceUpdates := make(chan []*structs.NodeDeviceResource, 1) 114 updateFn := func(devices []*structs.NodeDeviceResource) { 115 deviceUpdates <- devices 116 } 117 118 // Create a mock plugin catalog 119 mc := &loader.MockCatalog{} 120 121 // Create the config 122 logger := testlog.HCLogger(t) 123 config = &Config{ 124 Logger: logger, 125 PluginConfig: &base.AgentConfig{}, 126 StatsInterval: 100 * time.Millisecond, 127 State: state.NewMemDB(logger), 128 Updater: updateFn, 129 Loader: mc, 130 } 131 132 return config, deviceUpdates, mc 133 } 134 135 func configureCatalogWith(catalog *loader.MockCatalog, plugins map[*base.PluginInfoResponse]loader.PluginInstance) { 136 137 catalog.DispenseF = func(name, _ string, _ *base.AgentConfig, _ log.Logger) (loader.PluginInstance, error) { 138 for info, v := range plugins { 139 if info.Name == name { 140 return v, nil 141 } 142 } 143 144 return nil, fmt.Errorf("no matching plugin") 145 } 146 147 catalog.ReattachF = func(name, _ string, _ *plugin.ReattachConfig) (loader.PluginInstance, error) { 148 for info, v := range plugins { 149 if info.Name == name { 150 return v, nil 151 } 152 } 153 154 return nil, fmt.Errorf("no matching plugin") 155 } 156 157 catalog.CatalogF = func() map[string][]*base.PluginInfoResponse { 158 devices := make([]*base.PluginInfoResponse, 0, len(plugins)) 159 for k := range plugins { 160 devices = append(devices, k) 161 } 162 out := map[string][]*base.PluginInfoResponse{ 163 base.PluginTypeDevice: devices, 164 } 165 return out 166 } 167 } 168 169 func pluginInfoResponse(name string) *base.PluginInfoResponse { 170 return &base.PluginInfoResponse{ 171 Type: base.PluginTypeDevice, 172 PluginApiVersions: []string{"v0.0.1"}, 173 PluginVersion: "v0.0.1", 174 Name: name, 175 } 176 } 177 178 // drainNodeDeviceUpdates drains all updates to the node device fingerprint channel 179 func drainNodeDeviceUpdates(ctx context.Context, in chan []*structs.NodeDeviceResource) { 180 go func() { 181 for { 182 select { 183 case <-ctx.Done(): 184 return 185 case <-in: 186 } 187 } 188 }() 189 } 190 191 func deviceReserveFn(ids []string) (*device.ContainerReservation, error) { 192 return &device.ContainerReservation{ 193 Envs: map[string]string{ 194 "DEVICES": strings.Join(ids, ","), 195 }, 196 }, nil 197 } 198 199 // nvidiaAndIntelDefaultPlugins adds an nvidia and intel mock plugin to the 200 // catalog 201 func nvidiaAndIntelDefaultPlugins(catalog *loader.MockCatalog) { 202 pluginInfoNvidia := pluginInfoResponse("nvidia") 203 deviceNvidia := &device.MockDevicePlugin{ 204 MockPlugin: &base.MockPlugin{ 205 PluginInfoF: base.StaticInfo(pluginInfoNvidia), 206 ConfigSchemaF: base.TestConfigSchema(), 207 SetConfigF: base.NoopSetConfig(), 208 }, 209 FingerprintF: device.StaticFingerprinter([]*device.DeviceGroup{nvidiaDeviceGroup}), 210 ReserveF: deviceReserveFn, 211 StatsF: device.StaticStats([]*device.DeviceGroupStats{nvidiaDeviceGroupStats}), 212 } 213 pluginNvidia := loader.MockBasicExternalPlugin(deviceNvidia, device.ApiVersion010) 214 215 pluginInfoIntel := pluginInfoResponse("intel") 216 deviceIntel := &device.MockDevicePlugin{ 217 MockPlugin: &base.MockPlugin{ 218 PluginInfoF: base.StaticInfo(pluginInfoIntel), 219 ConfigSchemaF: base.TestConfigSchema(), 220 SetConfigF: base.NoopSetConfig(), 221 }, 222 FingerprintF: device.StaticFingerprinter([]*device.DeviceGroup{intelDeviceGroup}), 223 ReserveF: deviceReserveFn, 224 StatsF: device.StaticStats([]*device.DeviceGroupStats{intelDeviceGroupStats}), 225 } 226 pluginIntel := loader.MockBasicExternalPlugin(deviceIntel, device.ApiVersion010) 227 228 // Configure the catalog with two plugins 229 configureCatalogWith(catalog, map[*base.PluginInfoResponse]loader.PluginInstance{ 230 pluginInfoNvidia: pluginNvidia, 231 pluginInfoIntel: pluginIntel, 232 }) 233 } 234 235 // Test collecting statistics from all devices 236 func TestManager_AllStats(t *testing.T) { 237 t.Parallel() 238 require := require.New(t) 239 240 config, _, catalog := baseTestConfig(t) 241 nvidiaAndIntelDefaultPlugins(catalog) 242 243 m := New(config) 244 m.Run() 245 defer m.Shutdown() 246 require.Len(m.instances, 2) 247 248 // Wait till we get a fingerprint result 249 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 250 defer cancel() 251 <-m.WaitForFirstFingerprint(ctx) 252 require.NoError(ctx.Err()) 253 254 // Now collect all the stats 255 var stats []*device.DeviceGroupStats 256 testutil.WaitForResult(func() (bool, error) { 257 stats = m.AllStats() 258 l := len(stats) 259 if l == 2 { 260 return true, nil 261 } 262 263 return false, fmt.Errorf("expected count 2; got %d", l) 264 }, func(err error) { 265 t.Fatal(err) 266 }) 267 268 // Check we got stats from both the devices 269 var nstats, istats bool 270 for _, stat := range stats { 271 switch stat.Vendor { 272 case "intel": 273 istats = true 274 case "nvidia": 275 nstats = true 276 default: 277 t.Fatalf("unexpected vendor %q", stat.Vendor) 278 } 279 } 280 require.True(nstats) 281 require.True(istats) 282 } 283 284 // Test collecting statistics from a particular device 285 func TestManager_DeviceStats(t *testing.T) { 286 t.Parallel() 287 require := require.New(t) 288 289 config, _, catalog := baseTestConfig(t) 290 nvidiaAndIntelDefaultPlugins(catalog) 291 292 m := New(config) 293 m.Run() 294 defer m.Shutdown() 295 296 // Wait till we get a fingerprint result 297 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 298 defer cancel() 299 <-m.WaitForFirstFingerprint(ctx) 300 require.NoError(ctx.Err()) 301 302 testutil.WaitForResult(func() (bool, error) { 303 stats := m.AllStats() 304 l := len(stats) 305 if l == 2 { 306 return true, nil 307 } 308 309 return false, fmt.Errorf("expected count 2; got %d", l) 310 }, func(err error) { 311 t.Fatal(err) 312 }) 313 314 // Now collect the stats for one nvidia device 315 stat, err := m.DeviceStats(&structs.AllocatedDeviceResource{ 316 Vendor: "nvidia", 317 Type: "gpu", 318 Name: "1080ti", 319 DeviceIDs: []string{nvidiaDevice1ID}, 320 }) 321 require.NoError(err) 322 require.NotNil(stat) 323 324 require.Len(stat.InstanceStats, 1) 325 require.Contains(stat.InstanceStats, nvidiaDevice1ID) 326 327 istat := stat.InstanceStats[nvidiaDevice1ID] 328 require.EqualValues(218, *istat.Summary.IntNumeratorVal) 329 } 330 331 // Test reserving a particular device 332 func TestManager_Reserve(t *testing.T) { 333 t.Parallel() 334 r := require.New(t) 335 336 config, _, catalog := baseTestConfig(t) 337 nvidiaAndIntelDefaultPlugins(catalog) 338 339 m := New(config) 340 m.Run() 341 defer m.Shutdown() 342 343 // Wait till we get a fingerprint result 344 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 345 defer cancel() 346 <-m.WaitForFirstFingerprint(ctx) 347 r.NoError(ctx.Err()) 348 349 cases := []struct { 350 in *structs.AllocatedDeviceResource 351 expected string 352 err bool 353 }{ 354 { 355 in: &structs.AllocatedDeviceResource{ 356 Vendor: "nvidia", 357 Type: "gpu", 358 Name: "1080ti", 359 DeviceIDs: []string{nvidiaDevice1ID}, 360 }, 361 expected: nvidiaDevice1ID, 362 }, 363 { 364 in: &structs.AllocatedDeviceResource{ 365 Vendor: "nvidia", 366 Type: "gpu", 367 Name: "1080ti", 368 DeviceIDs: []string{nvidiaDevice0ID}, 369 }, 370 expected: nvidiaDevice0ID, 371 }, 372 { 373 in: &structs.AllocatedDeviceResource{ 374 Vendor: "nvidia", 375 Type: "gpu", 376 Name: "1080ti", 377 DeviceIDs: []string{nvidiaDevice0ID, nvidiaDevice1ID}, 378 }, 379 expected: fmt.Sprintf("%s,%s", nvidiaDevice0ID, nvidiaDevice1ID), 380 }, 381 { 382 in: &structs.AllocatedDeviceResource{ 383 Vendor: "nvidia", 384 Type: "gpu", 385 Name: "1080ti", 386 DeviceIDs: []string{nvidiaDevice0ID, nvidiaDevice1ID, "foo"}, 387 }, 388 err: true, 389 }, 390 { 391 in: &structs.AllocatedDeviceResource{ 392 Vendor: "intel", 393 Type: "gpu", 394 Name: "640GT", 395 DeviceIDs: []string{intelDeviceID}, 396 }, 397 expected: intelDeviceID, 398 }, 399 { 400 in: &structs.AllocatedDeviceResource{ 401 Vendor: "intel", 402 Type: "gpu", 403 Name: "foo", 404 DeviceIDs: []string{intelDeviceID}, 405 }, 406 err: true, 407 }, 408 } 409 410 for i, c := range cases { 411 t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { 412 r = require.New(t) 413 414 // Reserve a particular device 415 res, err := m.Reserve(c.in) 416 if !c.err { 417 r.NoError(err) 418 r.NotNil(res) 419 420 r.Len(res.Envs, 1) 421 r.Equal(res.Envs["DEVICES"], c.expected) 422 } else { 423 r.Error(err) 424 } 425 }) 426 } 427 } 428 429 // Test that shutdown shutsdown the plugins 430 func TestManager_Shutdown(t *testing.T) { 431 t.Parallel() 432 require := require.New(t) 433 434 config, _, catalog := baseTestConfig(t) 435 nvidiaAndIntelDefaultPlugins(catalog) 436 437 m := New(config) 438 m.Run() 439 defer m.Shutdown() 440 441 // Wait till we get a fingerprint result 442 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 443 defer cancel() 444 <-m.WaitForFirstFingerprint(ctx) 445 require.NoError(ctx.Err()) 446 447 // Call shutdown and assert that we killed the plugins 448 m.Shutdown() 449 450 for _, resp := range catalog.Catalog()[base.PluginTypeDevice] { 451 pinst, _ := catalog.Dispense(resp.Name, resp.Type, &base.AgentConfig{}, config.Logger) 452 require.True(pinst.Exited()) 453 } 454 } 455 456 // Test that startup shutsdown previously launched plugins 457 func TestManager_Run_ShutdownOld(t *testing.T) { 458 t.Parallel() 459 require := require.New(t) 460 461 config, _, catalog := baseTestConfig(t) 462 nvidiaAndIntelDefaultPlugins(catalog) 463 464 m := New(config) 465 m.Run() 466 defer m.Shutdown() 467 468 // Wait till we get a fingerprint result 469 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 470 defer cancel() 471 <-m.WaitForFirstFingerprint(ctx) 472 require.NoError(ctx.Err()) 473 474 // Create a new manager with the same config so that it reads the old state 475 m2 := New(config) 476 go m2.Run() 477 defer m2.Shutdown() 478 479 testutil.WaitForResult(func() (bool, error) { 480 for _, resp := range catalog.Catalog()[base.PluginTypeDevice] { 481 pinst, _ := catalog.Dispense(resp.Name, resp.Type, &base.AgentConfig{}, config.Logger) 482 if !pinst.Exited() { 483 return false, fmt.Errorf("plugin %q not shutdown", resp.Name) 484 } 485 } 486 487 return true, nil 488 }, func(err error) { 489 t.Fatal(err) 490 }) 491 }