github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/structs/devices_test.go (about)

     1  package structs
     2  
     3  import (
     4  	"testing"
     5  
     6  	"github.com/hashicorp/nomad/ci"
     7  	"github.com/hashicorp/nomad/helper/uuid"
     8  	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
     9  	"github.com/shoenig/test/must"
    10  	"github.com/stretchr/testify/require"
    11  )
    12  
    13  // nvidiaAllocatedDevice returns an allocated nvidia device
    14  func nvidiaAllocatedDevice() *AllocatedDeviceResource {
    15  	return &AllocatedDeviceResource{
    16  		Type:      "gpu",
    17  		Vendor:    "nvidia",
    18  		Name:      "1080ti",
    19  		DeviceIDs: []string{uuid.Generate()},
    20  	}
    21  }
    22  
    23  // nvidiaAlloc returns an allocation that has been assigned an nvidia device.
    24  func nvidiaAlloc() *Allocation {
    25  	a := MockAlloc()
    26  	a.AllocatedResources.Tasks["web"].Devices = []*AllocatedDeviceResource{
    27  		nvidiaAllocatedDevice(),
    28  	}
    29  	return a
    30  }
    31  
    32  // devNode returns a node containing two devices, an nvidia gpu and an intel
    33  // FPGA.
    34  func devNode() *Node {
    35  	n := MockNvidiaNode()
    36  	n.NodeResources.Devices = append(n.NodeResources.Devices, &NodeDeviceResource{
    37  		Type:   "fpga",
    38  		Vendor: "intel",
    39  		Name:   "F100",
    40  		Attributes: map[string]*psstructs.Attribute{
    41  			"memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB),
    42  		},
    43  		Instances: []*NodeDevice{
    44  			{
    45  				ID:      uuid.Generate(),
    46  				Healthy: true,
    47  			},
    48  			{
    49  				ID:      uuid.Generate(),
    50  				Healthy: false,
    51  			},
    52  		},
    53  	})
    54  	return n
    55  }
    56  
    57  // Make sure that the device accounter works even if the node has no devices
    58  func TestDeviceAccounter_AddAllocs_NoDeviceNode(t *testing.T) {
    59  	ci.Parallel(t)
    60  
    61  	require := require.New(t)
    62  	n := MockNode()
    63  	d := NewDeviceAccounter(n)
    64  	require.NotNil(d)
    65  
    66  	// Create three allocations, one with a device, one without, and one
    67  	// terminal
    68  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
    69  	allocs := []*Allocation{a1, a2, a3}
    70  	a3.DesiredStatus = AllocDesiredStatusStop
    71  
    72  	require.False(d.AddAllocs(allocs))
    73  	require.Len(d.Devices, 0)
    74  }
    75  
    76  // Add allocs to a node with a device
    77  func TestDeviceAccounter_AddAllocs(t *testing.T) {
    78  	ci.Parallel(t)
    79  
    80  	require := require.New(t)
    81  	n := devNode()
    82  	d := NewDeviceAccounter(n)
    83  	require.NotNil(d)
    84  
    85  	// Create three allocations, one with a device, one without, and one
    86  	// terminal
    87  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
    88  
    89  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
    90  	intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
    91  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
    92  
    93  	allocs := []*Allocation{a1, a2, a3}
    94  	a3.DesiredStatus = AllocDesiredStatusStop
    95  
    96  	require.False(d.AddAllocs(allocs))
    97  	require.Len(d.Devices, 2)
    98  
    99  	// Check that we have two devices for nvidia and that one of them is used
   100  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   101  	require.True(ok)
   102  	require.Len(nvidiaDevice.Instances, 2)
   103  	require.Contains(nvidiaDevice.Instances, nvidiaDev0ID)
   104  	require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID])
   105  
   106  	// Check only one instance of the intel device is set up since the other is
   107  	// unhealthy
   108  	intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()]
   109  	require.True(ok)
   110  	require.Len(intelDevice.Instances, 1)
   111  	require.Equal(0, intelDevice.Instances[intelDev0ID])
   112  }
   113  
   114  // Add alloc with unknown ID to a node with devices. This tests that we can
   115  // operate on previous allocs even if the device has changed to unhealthy and we
   116  // don't track it
   117  func TestDeviceAccounter_AddAllocs_UnknownID(t *testing.T) {
   118  	ci.Parallel(t)
   119  
   120  	require := require.New(t)
   121  	n := devNode()
   122  	d := NewDeviceAccounter(n)
   123  	require.NotNil(d)
   124  
   125  	// Create three allocations, one with a device, one without, and one
   126  	// terminal
   127  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
   128  
   129  	// a2 will have a random ID since it is generated
   130  
   131  	allocs := []*Allocation{a1, a2, a3}
   132  	a3.DesiredStatus = AllocDesiredStatusStop
   133  
   134  	require.False(d.AddAllocs(allocs))
   135  	require.Len(d.Devices, 2)
   136  
   137  	// Check that we have two devices for nvidia and that one of them is used
   138  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   139  	require.True(ok)
   140  	require.Len(nvidiaDevice.Instances, 2)
   141  	for _, v := range nvidiaDevice.Instances {
   142  		require.Equal(0, v)
   143  	}
   144  }
   145  
   146  // Test that collision detection works
   147  func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) {
   148  	ci.Parallel(t)
   149  
   150  	require := require.New(t)
   151  	n := devNode()
   152  	d := NewDeviceAccounter(n)
   153  	require.NotNil(d)
   154  
   155  	// Create two allocations, both with the same device
   156  	a1, a2 := nvidiaAlloc(), nvidiaAlloc()
   157  
   158  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   159  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   160  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   161  
   162  	allocs := []*Allocation{a1, a2}
   163  	require.True(d.AddAllocs(allocs))
   164  }
   165  
   166  // Assert that devices are not freed when an alloc's ServerTerminalStatus is
   167  // true, but only when ClientTerminalStatus is true.
   168  func TestDeviceAccounter_AddAllocs_TerminalStatus(t *testing.T) {
   169  	ci.Parallel(t)
   170  
   171  	n := devNode()
   172  	d := NewDeviceAccounter(n)
   173  
   174  	// Create two allocations, both with the same device. First is being told to
   175  	// stop but has not stopped yet.
   176  	a1, a2 := nvidiaAlloc(), nvidiaAlloc()
   177  	a1.DesiredStatus = AllocDesiredStatusStop
   178  	a1.ClientStatus = AllocClientStatusRunning
   179  
   180  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   181  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   182  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   183  
   184  	allocs := []*Allocation{a1, a2}
   185  
   186  	// Since a1 has not stopped on the client, its device is still in use
   187  	must.True(t, d.AddAllocs(allocs))
   188  
   189  	// Assert that stop a1 on the client frees the device for use by a2
   190  	a1.ClientStatus = AllocClientStatusComplete
   191  	d = NewDeviceAccounter(n)
   192  	must.False(t, d.AddAllocs(allocs))
   193  }
   194  
   195  // Make sure that the device allocator works even if the node has no devices
   196  func TestDeviceAccounter_AddReserved_NoDeviceNode(t *testing.T) {
   197  	ci.Parallel(t)
   198  
   199  	require := require.New(t)
   200  	n := MockNode()
   201  	d := NewDeviceAccounter(n)
   202  	require.NotNil(d)
   203  
   204  	require.False(d.AddReserved(nvidiaAllocatedDevice()))
   205  	require.Len(d.Devices, 0)
   206  }
   207  
   208  // Add reserved to a node with a device
   209  func TestDeviceAccounter_AddReserved(t *testing.T) {
   210  	ci.Parallel(t)
   211  
   212  	require := require.New(t)
   213  	n := devNode()
   214  	d := NewDeviceAccounter(n)
   215  	require.NotNil(d)
   216  
   217  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   218  	intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
   219  
   220  	res := nvidiaAllocatedDevice()
   221  	res.DeviceIDs = []string{nvidiaDev0ID}
   222  
   223  	require.False(d.AddReserved(res))
   224  	require.Len(d.Devices, 2)
   225  
   226  	// Check that we have two devices for nvidia and that one of them is used
   227  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   228  	require.True(ok)
   229  	require.Len(nvidiaDevice.Instances, 2)
   230  	require.Contains(nvidiaDevice.Instances, nvidiaDev0ID)
   231  	require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID])
   232  
   233  	// Check only one instance of the intel device is set up since the other is
   234  	// unhealthy
   235  	intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()]
   236  	require.True(ok)
   237  	require.Len(intelDevice.Instances, 1)
   238  	require.Equal(0, intelDevice.Instances[intelDev0ID])
   239  }
   240  
   241  // Test that collision detection works
   242  func TestDeviceAccounter_AddReserved_Collision(t *testing.T) {
   243  	ci.Parallel(t)
   244  
   245  	require := require.New(t)
   246  	n := devNode()
   247  	d := NewDeviceAccounter(n)
   248  	require.NotNil(d)
   249  
   250  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   251  
   252  	// Create an alloc with nvidia
   253  	a1 := nvidiaAlloc()
   254  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   255  	require.False(d.AddAllocs([]*Allocation{a1}))
   256  
   257  	// Reserve the same device
   258  	res := nvidiaAllocatedDevice()
   259  	res.DeviceIDs = []string{nvidiaDev0ID}
   260  	require.True(d.AddReserved(res))
   261  }