github.com/hernad/nomad@v1.6.112/nomad/structs/devices_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package structs
     5  
     6  import (
     7  	"testing"
     8  
     9  	"github.com/hernad/nomad/ci"
    10  	"github.com/hernad/nomad/helper/uuid"
    11  	psstructs "github.com/hernad/nomad/plugins/shared/structs"
    12  	"github.com/shoenig/test/must"
    13  	"github.com/stretchr/testify/require"
    14  )
    15  
    16  // nvidiaAllocatedDevice returns an allocated nvidia device
    17  func nvidiaAllocatedDevice() *AllocatedDeviceResource {
    18  	return &AllocatedDeviceResource{
    19  		Type:      "gpu",
    20  		Vendor:    "nvidia",
    21  		Name:      "1080ti",
    22  		DeviceIDs: []string{uuid.Generate()},
    23  	}
    24  }
    25  
    26  // nvidiaAlloc returns an allocation that has been assigned an nvidia device.
    27  func nvidiaAlloc() *Allocation {
    28  	a := MockAlloc()
    29  	a.AllocatedResources.Tasks["web"].Devices = []*AllocatedDeviceResource{
    30  		nvidiaAllocatedDevice(),
    31  	}
    32  	return a
    33  }
    34  
    35  // devNode returns a node containing two devices, an nvidia gpu and an intel
    36  // FPGA.
    37  func devNode() *Node {
    38  	n := MockNvidiaNode()
    39  	n.NodeResources.Devices = append(n.NodeResources.Devices, &NodeDeviceResource{
    40  		Type:   "fpga",
    41  		Vendor: "intel",
    42  		Name:   "F100",
    43  		Attributes: map[string]*psstructs.Attribute{
    44  			"memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB),
    45  		},
    46  		Instances: []*NodeDevice{
    47  			{
    48  				ID:      uuid.Generate(),
    49  				Healthy: true,
    50  			},
    51  			{
    52  				ID:      uuid.Generate(),
    53  				Healthy: false,
    54  			},
    55  		},
    56  	})
    57  	return n
    58  }
    59  
    60  // Make sure that the device accounter works even if the node has no devices
    61  func TestDeviceAccounter_AddAllocs_NoDeviceNode(t *testing.T) {
    62  	ci.Parallel(t)
    63  
    64  	require := require.New(t)
    65  	n := MockNode()
    66  	d := NewDeviceAccounter(n)
    67  	require.NotNil(d)
    68  
    69  	// Create three allocations, one with a device, one without, and one
    70  	// terminal
    71  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
    72  	allocs := []*Allocation{a1, a2, a3}
    73  	a3.DesiredStatus = AllocDesiredStatusStop
    74  
    75  	require.False(d.AddAllocs(allocs))
    76  	require.Len(d.Devices, 0)
    77  }
    78  
    79  // Add allocs to a node with a device
    80  func TestDeviceAccounter_AddAllocs(t *testing.T) {
    81  	ci.Parallel(t)
    82  
    83  	require := require.New(t)
    84  	n := devNode()
    85  	d := NewDeviceAccounter(n)
    86  	require.NotNil(d)
    87  
    88  	// Create three allocations, one with a device, one without, and one
    89  	// terminal
    90  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
    91  
    92  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
    93  	intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
    94  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
    95  
    96  	allocs := []*Allocation{a1, a2, a3}
    97  	a3.DesiredStatus = AllocDesiredStatusStop
    98  
    99  	require.False(d.AddAllocs(allocs))
   100  	require.Len(d.Devices, 2)
   101  
   102  	// Check that we have two devices for nvidia and that one of them is used
   103  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   104  	require.True(ok)
   105  	require.Len(nvidiaDevice.Instances, 2)
   106  	require.Contains(nvidiaDevice.Instances, nvidiaDev0ID)
   107  	require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID])
   108  
   109  	// Check only one instance of the intel device is set up since the other is
   110  	// unhealthy
   111  	intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()]
   112  	require.True(ok)
   113  	require.Len(intelDevice.Instances, 1)
   114  	require.Equal(0, intelDevice.Instances[intelDev0ID])
   115  }
   116  
   117  // Add alloc with unknown ID to a node with devices. This tests that we can
   118  // operate on previous allocs even if the device has changed to unhealthy and we
   119  // don't track it
   120  func TestDeviceAccounter_AddAllocs_UnknownID(t *testing.T) {
   121  	ci.Parallel(t)
   122  
   123  	require := require.New(t)
   124  	n := devNode()
   125  	d := NewDeviceAccounter(n)
   126  	require.NotNil(d)
   127  
   128  	// Create three allocations, one with a device, one without, and one
   129  	// terminal
   130  	a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc()
   131  
   132  	// a2 will have a random ID since it is generated
   133  
   134  	allocs := []*Allocation{a1, a2, a3}
   135  	a3.DesiredStatus = AllocDesiredStatusStop
   136  
   137  	require.False(d.AddAllocs(allocs))
   138  	require.Len(d.Devices, 2)
   139  
   140  	// Check that we have two devices for nvidia and that one of them is used
   141  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   142  	require.True(ok)
   143  	require.Len(nvidiaDevice.Instances, 2)
   144  	for _, v := range nvidiaDevice.Instances {
   145  		require.Equal(0, v)
   146  	}
   147  }
   148  
   149  // Test that collision detection works
   150  func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) {
   151  	ci.Parallel(t)
   152  
   153  	require := require.New(t)
   154  	n := devNode()
   155  	d := NewDeviceAccounter(n)
   156  	require.NotNil(d)
   157  
   158  	// Create two allocations, both with the same device
   159  	a1, a2 := nvidiaAlloc(), nvidiaAlloc()
   160  
   161  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   162  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   163  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   164  
   165  	allocs := []*Allocation{a1, a2}
   166  	require.True(d.AddAllocs(allocs))
   167  }
   168  
   169  // Assert that devices are not freed when an alloc's ServerTerminalStatus is
   170  // true, but only when ClientTerminalStatus is true.
   171  func TestDeviceAccounter_AddAllocs_TerminalStatus(t *testing.T) {
   172  	ci.Parallel(t)
   173  
   174  	n := devNode()
   175  	d := NewDeviceAccounter(n)
   176  
   177  	// Create two allocations, both with the same device. First is being told to
   178  	// stop but has not stopped yet.
   179  	a1, a2 := nvidiaAlloc(), nvidiaAlloc()
   180  	a1.DesiredStatus = AllocDesiredStatusStop
   181  	a1.ClientStatus = AllocClientStatusRunning
   182  
   183  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   184  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   185  	a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   186  
   187  	allocs := []*Allocation{a1, a2}
   188  
   189  	// Since a1 has not stopped on the client, its device is still in use
   190  	must.True(t, d.AddAllocs(allocs))
   191  
   192  	// Assert that stop a1 on the client frees the device for use by a2
   193  	a1.ClientStatus = AllocClientStatusComplete
   194  	d = NewDeviceAccounter(n)
   195  	must.False(t, d.AddAllocs(allocs))
   196  }
   197  
   198  // Make sure that the device allocator works even if the node has no devices
   199  func TestDeviceAccounter_AddReserved_NoDeviceNode(t *testing.T) {
   200  	ci.Parallel(t)
   201  
   202  	require := require.New(t)
   203  	n := MockNode()
   204  	d := NewDeviceAccounter(n)
   205  	require.NotNil(d)
   206  
   207  	require.False(d.AddReserved(nvidiaAllocatedDevice()))
   208  	require.Len(d.Devices, 0)
   209  }
   210  
   211  // Add reserved to a node with a device
   212  func TestDeviceAccounter_AddReserved(t *testing.T) {
   213  	ci.Parallel(t)
   214  
   215  	require := require.New(t)
   216  	n := devNode()
   217  	d := NewDeviceAccounter(n)
   218  	require.NotNil(d)
   219  
   220  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   221  	intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
   222  
   223  	res := nvidiaAllocatedDevice()
   224  	res.DeviceIDs = []string{nvidiaDev0ID}
   225  
   226  	require.False(d.AddReserved(res))
   227  	require.Len(d.Devices, 2)
   228  
   229  	// Check that we have two devices for nvidia and that one of them is used
   230  	nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()]
   231  	require.True(ok)
   232  	require.Len(nvidiaDevice.Instances, 2)
   233  	require.Contains(nvidiaDevice.Instances, nvidiaDev0ID)
   234  	require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID])
   235  
   236  	// Check only one instance of the intel device is set up since the other is
   237  	// unhealthy
   238  	intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()]
   239  	require.True(ok)
   240  	require.Len(intelDevice.Instances, 1)
   241  	require.Equal(0, intelDevice.Instances[intelDev0ID])
   242  }
   243  
   244  // Test that collision detection works
   245  func TestDeviceAccounter_AddReserved_Collision(t *testing.T) {
   246  	ci.Parallel(t)
   247  
   248  	require := require.New(t)
   249  	n := devNode()
   250  	d := NewDeviceAccounter(n)
   251  	require.NotNil(d)
   252  
   253  	nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
   254  
   255  	// Create an alloc with nvidia
   256  	a1 := nvidiaAlloc()
   257  	a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
   258  	require.False(d.AddAllocs([]*Allocation{a1}))
   259  
   260  	// Reserve the same device
   261  	res := nvidiaAllocatedDevice()
   262  	res.DeviceIDs = []string{nvidiaDev0ID}
   263  	require.True(d.AddReserved(res))
   264  }