github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/devicemanager/manager_test.go (about)

     1  package devicemanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"testing"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	plugin "github.com/hashicorp/go-plugin"
    12  	"github.com/hashicorp/nomad/client/state"
    13  	"github.com/hashicorp/nomad/helper"
    14  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    15  	"github.com/hashicorp/nomad/helper/testlog"
    16  	"github.com/hashicorp/nomad/helper/uuid"
    17  	"github.com/hashicorp/nomad/nomad/structs"
    18  	"github.com/hashicorp/nomad/plugins/base"
    19  	"github.com/hashicorp/nomad/plugins/device"
    20  	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
    21  	"github.com/hashicorp/nomad/testutil"
    22  	"github.com/stretchr/testify/require"
    23  )
    24  
    25  var (
    26  	nvidiaDevice0ID   = uuid.Generate()
    27  	nvidiaDevice1ID   = uuid.Generate()
    28  	nvidiaDeviceGroup = &device.DeviceGroup{
    29  		Vendor: "nvidia",
    30  		Type:   "gpu",
    31  		Name:   "1080ti",
    32  		Devices: []*device.Device{
    33  			{
    34  				ID:      nvidiaDevice0ID,
    35  				Healthy: true,
    36  			},
    37  			{
    38  				ID:      nvidiaDevice1ID,
    39  				Healthy: true,
    40  			},
    41  		},
    42  		Attributes: map[string]*psstructs.Attribute{
    43  			"memory": {
    44  				Int:  helper.Int64ToPtr(4),
    45  				Unit: "GB",
    46  			},
    47  		},
    48  	}
    49  
    50  	intelDeviceID    = uuid.Generate()
    51  	intelDeviceGroup = &device.DeviceGroup{
    52  		Vendor: "intel",
    53  		Type:   "gpu",
    54  		Name:   "640GT",
    55  		Devices: []*device.Device{
    56  			{
    57  				ID:      intelDeviceID,
    58  				Healthy: true,
    59  			},
    60  		},
    61  		Attributes: map[string]*psstructs.Attribute{
    62  			"memory": {
    63  				Int:  helper.Int64ToPtr(2),
    64  				Unit: "GB",
    65  			},
    66  		},
    67  	}
    68  
    69  	nvidiaDeviceGroupStats = &device.DeviceGroupStats{
    70  		Vendor: "nvidia",
    71  		Type:   "gpu",
    72  		Name:   "1080ti",
    73  		InstanceStats: map[string]*device.DeviceStats{
    74  			nvidiaDevice0ID: {
    75  				Summary: &psstructs.StatValue{
    76  					IntNumeratorVal: helper.Int64ToPtr(212),
    77  					Unit:            "F",
    78  					Desc:            "Temperature",
    79  				},
    80  			},
    81  			nvidiaDevice1ID: {
    82  				Summary: &psstructs.StatValue{
    83  					IntNumeratorVal: helper.Int64ToPtr(218),
    84  					Unit:            "F",
    85  					Desc:            "Temperature",
    86  				},
    87  			},
    88  		},
    89  	}
    90  
    91  	intelDeviceGroupStats = &device.DeviceGroupStats{
    92  		Vendor: "intel",
    93  		Type:   "gpu",
    94  		Name:   "640GT",
    95  		InstanceStats: map[string]*device.DeviceStats{
    96  			intelDeviceID: {
    97  				Summary: &psstructs.StatValue{
    98  					IntNumeratorVal: helper.Int64ToPtr(220),
    99  					Unit:            "F",
   100  					Desc:            "Temperature",
   101  				},
   102  			},
   103  		},
   104  	}
   105  )
   106  
   107  func baseTestConfig(t *testing.T) (
   108  	config *Config,
   109  	deviceUpdateCh chan []*structs.NodeDeviceResource,
   110  	catalog *loader.MockCatalog) {
   111  
   112  	// Create an update handler
   113  	deviceUpdates := make(chan []*structs.NodeDeviceResource, 1)
   114  	updateFn := func(devices []*structs.NodeDeviceResource) {
   115  		deviceUpdates <- devices
   116  	}
   117  
   118  	// Create a mock plugin catalog
   119  	mc := &loader.MockCatalog{}
   120  
   121  	// Create the config
   122  	logger := testlog.HCLogger(t)
   123  	config = &Config{
   124  		Logger:        logger,
   125  		PluginConfig:  &base.AgentConfig{},
   126  		StatsInterval: 100 * time.Millisecond,
   127  		State:         state.NewMemDB(logger),
   128  		Updater:       updateFn,
   129  		Loader:        mc,
   130  	}
   131  
   132  	return config, deviceUpdates, mc
   133  }
   134  
   135  func configureCatalogWith(catalog *loader.MockCatalog, plugins map[*base.PluginInfoResponse]loader.PluginInstance) {
   136  
   137  	catalog.DispenseF = func(name, _ string, _ *base.AgentConfig, _ log.Logger) (loader.PluginInstance, error) {
   138  		for info, v := range plugins {
   139  			if info.Name == name {
   140  				return v, nil
   141  			}
   142  		}
   143  
   144  		return nil, fmt.Errorf("no matching plugin")
   145  	}
   146  
   147  	catalog.ReattachF = func(name, _ string, _ *plugin.ReattachConfig) (loader.PluginInstance, error) {
   148  		for info, v := range plugins {
   149  			if info.Name == name {
   150  				return v, nil
   151  			}
   152  		}
   153  
   154  		return nil, fmt.Errorf("no matching plugin")
   155  	}
   156  
   157  	catalog.CatalogF = func() map[string][]*base.PluginInfoResponse {
   158  		devices := make([]*base.PluginInfoResponse, 0, len(plugins))
   159  		for k := range plugins {
   160  			devices = append(devices, k)
   161  		}
   162  		out := map[string][]*base.PluginInfoResponse{
   163  			base.PluginTypeDevice: devices,
   164  		}
   165  		return out
   166  	}
   167  }
   168  
   169  func pluginInfoResponse(name string) *base.PluginInfoResponse {
   170  	return &base.PluginInfoResponse{
   171  		Type:              base.PluginTypeDevice,
   172  		PluginApiVersions: []string{"v0.0.1"},
   173  		PluginVersion:     "v0.0.1",
   174  		Name:              name,
   175  	}
   176  }
   177  
   178  // drainNodeDeviceUpdates drains all updates to the node device fingerprint channel
   179  func drainNodeDeviceUpdates(ctx context.Context, in chan []*structs.NodeDeviceResource) {
   180  	go func() {
   181  		for {
   182  			select {
   183  			case <-ctx.Done():
   184  				return
   185  			case <-in:
   186  			}
   187  		}
   188  	}()
   189  }
   190  
   191  func deviceReserveFn(ids []string) (*device.ContainerReservation, error) {
   192  	return &device.ContainerReservation{
   193  		Envs: map[string]string{
   194  			"DEVICES": strings.Join(ids, ","),
   195  		},
   196  	}, nil
   197  }
   198  
   199  // nvidiaAndIntelDefaultPlugins adds an nvidia and intel mock plugin to the
   200  // catalog
   201  func nvidiaAndIntelDefaultPlugins(catalog *loader.MockCatalog) {
   202  	pluginInfoNvidia := pluginInfoResponse("nvidia")
   203  	deviceNvidia := &device.MockDevicePlugin{
   204  		MockPlugin: &base.MockPlugin{
   205  			PluginInfoF:   base.StaticInfo(pluginInfoNvidia),
   206  			ConfigSchemaF: base.TestConfigSchema(),
   207  			SetConfigF:    base.NoopSetConfig(),
   208  		},
   209  		FingerprintF: device.StaticFingerprinter([]*device.DeviceGroup{nvidiaDeviceGroup}),
   210  		ReserveF:     deviceReserveFn,
   211  		StatsF:       device.StaticStats([]*device.DeviceGroupStats{nvidiaDeviceGroupStats}),
   212  	}
   213  	pluginNvidia := loader.MockBasicExternalPlugin(deviceNvidia, device.ApiVersion010)
   214  
   215  	pluginInfoIntel := pluginInfoResponse("intel")
   216  	deviceIntel := &device.MockDevicePlugin{
   217  		MockPlugin: &base.MockPlugin{
   218  			PluginInfoF:   base.StaticInfo(pluginInfoIntel),
   219  			ConfigSchemaF: base.TestConfigSchema(),
   220  			SetConfigF:    base.NoopSetConfig(),
   221  		},
   222  		FingerprintF: device.StaticFingerprinter([]*device.DeviceGroup{intelDeviceGroup}),
   223  		ReserveF:     deviceReserveFn,
   224  		StatsF:       device.StaticStats([]*device.DeviceGroupStats{intelDeviceGroupStats}),
   225  	}
   226  	pluginIntel := loader.MockBasicExternalPlugin(deviceIntel, device.ApiVersion010)
   227  
   228  	// Configure the catalog with two plugins
   229  	configureCatalogWith(catalog, map[*base.PluginInfoResponse]loader.PluginInstance{
   230  		pluginInfoNvidia: pluginNvidia,
   231  		pluginInfoIntel:  pluginIntel,
   232  	})
   233  }
   234  
   235  // Test collecting statistics from all devices
   236  func TestManager_AllStats(t *testing.T) {
   237  	t.Parallel()
   238  	require := require.New(t)
   239  
   240  	config, _, catalog := baseTestConfig(t)
   241  	nvidiaAndIntelDefaultPlugins(catalog)
   242  
   243  	m := New(config)
   244  	m.Run()
   245  	defer m.Shutdown()
   246  	require.Len(m.instances, 2)
   247  
   248  	// Wait till we get a fingerprint result
   249  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   250  	defer cancel()
   251  	<-m.WaitForFirstFingerprint(ctx)
   252  	require.NoError(ctx.Err())
   253  
   254  	// Now collect all the stats
   255  	var stats []*device.DeviceGroupStats
   256  	testutil.WaitForResult(func() (bool, error) {
   257  		stats = m.AllStats()
   258  		l := len(stats)
   259  		if l == 2 {
   260  			return true, nil
   261  		}
   262  
   263  		return false, fmt.Errorf("expected count 2; got %d", l)
   264  	}, func(err error) {
   265  		t.Fatal(err)
   266  	})
   267  
   268  	// Check we got stats from both the devices
   269  	var nstats, istats bool
   270  	for _, stat := range stats {
   271  		switch stat.Vendor {
   272  		case "intel":
   273  			istats = true
   274  		case "nvidia":
   275  			nstats = true
   276  		default:
   277  			t.Fatalf("unexpected vendor %q", stat.Vendor)
   278  		}
   279  	}
   280  	require.True(nstats)
   281  	require.True(istats)
   282  }
   283  
   284  // Test collecting statistics from a particular device
   285  func TestManager_DeviceStats(t *testing.T) {
   286  	t.Parallel()
   287  	require := require.New(t)
   288  
   289  	config, _, catalog := baseTestConfig(t)
   290  	nvidiaAndIntelDefaultPlugins(catalog)
   291  
   292  	m := New(config)
   293  	m.Run()
   294  	defer m.Shutdown()
   295  
   296  	// Wait till we get a fingerprint result
   297  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   298  	defer cancel()
   299  	<-m.WaitForFirstFingerprint(ctx)
   300  	require.NoError(ctx.Err())
   301  
   302  	testutil.WaitForResult(func() (bool, error) {
   303  		stats := m.AllStats()
   304  		l := len(stats)
   305  		if l == 2 {
   306  			return true, nil
   307  		}
   308  
   309  		return false, fmt.Errorf("expected count 2; got %d", l)
   310  	}, func(err error) {
   311  		t.Fatal(err)
   312  	})
   313  
   314  	// Now collect the stats for one nvidia device
   315  	stat, err := m.DeviceStats(&structs.AllocatedDeviceResource{
   316  		Vendor:    "nvidia",
   317  		Type:      "gpu",
   318  		Name:      "1080ti",
   319  		DeviceIDs: []string{nvidiaDevice1ID},
   320  	})
   321  	require.NoError(err)
   322  	require.NotNil(stat)
   323  
   324  	require.Len(stat.InstanceStats, 1)
   325  	require.Contains(stat.InstanceStats, nvidiaDevice1ID)
   326  
   327  	istat := stat.InstanceStats[nvidiaDevice1ID]
   328  	require.EqualValues(218, *istat.Summary.IntNumeratorVal)
   329  }
   330  
   331  // Test reserving a particular device
   332  func TestManager_Reserve(t *testing.T) {
   333  	t.Parallel()
   334  	r := require.New(t)
   335  
   336  	config, _, catalog := baseTestConfig(t)
   337  	nvidiaAndIntelDefaultPlugins(catalog)
   338  
   339  	m := New(config)
   340  	m.Run()
   341  	defer m.Shutdown()
   342  
   343  	// Wait till we get a fingerprint result
   344  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   345  	defer cancel()
   346  	<-m.WaitForFirstFingerprint(ctx)
   347  	r.NoError(ctx.Err())
   348  
   349  	cases := []struct {
   350  		in       *structs.AllocatedDeviceResource
   351  		expected string
   352  		err      bool
   353  	}{
   354  		{
   355  			in: &structs.AllocatedDeviceResource{
   356  				Vendor:    "nvidia",
   357  				Type:      "gpu",
   358  				Name:      "1080ti",
   359  				DeviceIDs: []string{nvidiaDevice1ID},
   360  			},
   361  			expected: nvidiaDevice1ID,
   362  		},
   363  		{
   364  			in: &structs.AllocatedDeviceResource{
   365  				Vendor:    "nvidia",
   366  				Type:      "gpu",
   367  				Name:      "1080ti",
   368  				DeviceIDs: []string{nvidiaDevice0ID},
   369  			},
   370  			expected: nvidiaDevice0ID,
   371  		},
   372  		{
   373  			in: &structs.AllocatedDeviceResource{
   374  				Vendor:    "nvidia",
   375  				Type:      "gpu",
   376  				Name:      "1080ti",
   377  				DeviceIDs: []string{nvidiaDevice0ID, nvidiaDevice1ID},
   378  			},
   379  			expected: fmt.Sprintf("%s,%s", nvidiaDevice0ID, nvidiaDevice1ID),
   380  		},
   381  		{
   382  			in: &structs.AllocatedDeviceResource{
   383  				Vendor:    "nvidia",
   384  				Type:      "gpu",
   385  				Name:      "1080ti",
   386  				DeviceIDs: []string{nvidiaDevice0ID, nvidiaDevice1ID, "foo"},
   387  			},
   388  			err: true,
   389  		},
   390  		{
   391  			in: &structs.AllocatedDeviceResource{
   392  				Vendor:    "intel",
   393  				Type:      "gpu",
   394  				Name:      "640GT",
   395  				DeviceIDs: []string{intelDeviceID},
   396  			},
   397  			expected: intelDeviceID,
   398  		},
   399  		{
   400  			in: &structs.AllocatedDeviceResource{
   401  				Vendor:    "intel",
   402  				Type:      "gpu",
   403  				Name:      "foo",
   404  				DeviceIDs: []string{intelDeviceID},
   405  			},
   406  			err: true,
   407  		},
   408  	}
   409  
   410  	for i, c := range cases {
   411  		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
   412  			r = require.New(t)
   413  
   414  			// Reserve a particular device
   415  			res, err := m.Reserve(c.in)
   416  			if !c.err {
   417  				r.NoError(err)
   418  				r.NotNil(res)
   419  
   420  				r.Len(res.Envs, 1)
   421  				r.Equal(res.Envs["DEVICES"], c.expected)
   422  			} else {
   423  				r.Error(err)
   424  			}
   425  		})
   426  	}
   427  }
   428  
   429  // Test that shutdown shutsdown the plugins
   430  func TestManager_Shutdown(t *testing.T) {
   431  	t.Parallel()
   432  	require := require.New(t)
   433  
   434  	config, _, catalog := baseTestConfig(t)
   435  	nvidiaAndIntelDefaultPlugins(catalog)
   436  
   437  	m := New(config)
   438  	m.Run()
   439  	defer m.Shutdown()
   440  
   441  	// Wait till we get a fingerprint result
   442  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   443  	defer cancel()
   444  	<-m.WaitForFirstFingerprint(ctx)
   445  	require.NoError(ctx.Err())
   446  
   447  	// Call shutdown and assert that we killed the plugins
   448  	m.Shutdown()
   449  
   450  	for _, resp := range catalog.Catalog()[base.PluginTypeDevice] {
   451  		pinst, _ := catalog.Dispense(resp.Name, resp.Type, &base.AgentConfig{}, config.Logger)
   452  		require.True(pinst.Exited())
   453  	}
   454  }
   455  
   456  // Test that startup shutsdown previously launched plugins
   457  func TestManager_Run_ShutdownOld(t *testing.T) {
   458  	t.Parallel()
   459  	require := require.New(t)
   460  
   461  	config, _, catalog := baseTestConfig(t)
   462  	nvidiaAndIntelDefaultPlugins(catalog)
   463  
   464  	m := New(config)
   465  	m.Run()
   466  	defer m.Shutdown()
   467  
   468  	// Wait till we get a fingerprint result
   469  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   470  	defer cancel()
   471  	<-m.WaitForFirstFingerprint(ctx)
   472  	require.NoError(ctx.Err())
   473  
   474  	// Create a new manager with the same config so that it reads the old state
   475  	m2 := New(config)
   476  	go m2.Run()
   477  	defer m2.Shutdown()
   478  
   479  	testutil.WaitForResult(func() (bool, error) {
   480  		for _, resp := range catalog.Catalog()[base.PluginTypeDevice] {
   481  			pinst, _ := catalog.Dispense(resp.Name, resp.Type, &base.AgentConfig{}, config.Logger)
   482  			if !pinst.Exited() {
   483  				return false, fmt.Errorf("plugin %q not shutdown", resp.Name)
   484  			}
   485  		}
   486  
   487  		return true, nil
   488  	}, func(err error) {
   489  		t.Fatal(err)
   490  	})
   491  }