github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/structs/devices_test.go (about) 1 package structs 2 3 import ( 4 "testing" 5 6 "github.com/hashicorp/nomad/ci" 7 "github.com/hashicorp/nomad/helper/uuid" 8 psstructs "github.com/hashicorp/nomad/plugins/shared/structs" 9 "github.com/shoenig/test/must" 10 "github.com/stretchr/testify/require" 11 ) 12 13 // nvidiaAllocatedDevice returns an allocated nvidia device 14 func nvidiaAllocatedDevice() *AllocatedDeviceResource { 15 return &AllocatedDeviceResource{ 16 Type: "gpu", 17 Vendor: "nvidia", 18 Name: "1080ti", 19 DeviceIDs: []string{uuid.Generate()}, 20 } 21 } 22 23 // nvidiaAlloc returns an allocation that has been assigned an nvidia device. 24 func nvidiaAlloc() *Allocation { 25 a := MockAlloc() 26 a.AllocatedResources.Tasks["web"].Devices = []*AllocatedDeviceResource{ 27 nvidiaAllocatedDevice(), 28 } 29 return a 30 } 31 32 // devNode returns a node containing two devices, an nvidia gpu and an intel 33 // FPGA. 34 func devNode() *Node { 35 n := MockNvidiaNode() 36 n.NodeResources.Devices = append(n.NodeResources.Devices, &NodeDeviceResource{ 37 Type: "fpga", 38 Vendor: "intel", 39 Name: "F100", 40 Attributes: map[string]*psstructs.Attribute{ 41 "memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB), 42 }, 43 Instances: []*NodeDevice{ 44 { 45 ID: uuid.Generate(), 46 Healthy: true, 47 }, 48 { 49 ID: uuid.Generate(), 50 Healthy: false, 51 }, 52 }, 53 }) 54 return n 55 } 56 57 // Make sure that the device accounter works even if the node has no devices 58 func TestDeviceAccounter_AddAllocs_NoDeviceNode(t *testing.T) { 59 ci.Parallel(t) 60 61 require := require.New(t) 62 n := MockNode() 63 d := NewDeviceAccounter(n) 64 require.NotNil(d) 65 66 // Create three allocations, one with a device, one without, and one 67 // terminal 68 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 69 allocs := []*Allocation{a1, a2, a3} 70 a3.DesiredStatus = AllocDesiredStatusStop 71 72 require.False(d.AddAllocs(allocs)) 73 require.Len(d.Devices, 0) 74 } 75 76 // Add allocs to a node with a device 77 func TestDeviceAccounter_AddAllocs(t *testing.T) { 78 ci.Parallel(t) 79 80 require := require.New(t) 81 n := devNode() 82 d := NewDeviceAccounter(n) 83 require.NotNil(d) 84 85 // Create three allocations, one with a device, one without, and one 86 // terminal 87 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 88 89 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 90 intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID 91 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 92 93 allocs := []*Allocation{a1, a2, a3} 94 a3.DesiredStatus = AllocDesiredStatusStop 95 96 require.False(d.AddAllocs(allocs)) 97 require.Len(d.Devices, 2) 98 99 // Check that we have two devices for nvidia and that one of them is used 100 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 101 require.True(ok) 102 require.Len(nvidiaDevice.Instances, 2) 103 require.Contains(nvidiaDevice.Instances, nvidiaDev0ID) 104 require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID]) 105 106 // Check only one instance of the intel device is set up since the other is 107 // unhealthy 108 intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()] 109 require.True(ok) 110 require.Len(intelDevice.Instances, 1) 111 require.Equal(0, intelDevice.Instances[intelDev0ID]) 112 } 113 114 // Add alloc with unknown ID to a node with devices. This tests that we can 115 // operate on previous allocs even if the device has changed to unhealthy and we 116 // don't track it 117 func TestDeviceAccounter_AddAllocs_UnknownID(t *testing.T) { 118 ci.Parallel(t) 119 120 require := require.New(t) 121 n := devNode() 122 d := NewDeviceAccounter(n) 123 require.NotNil(d) 124 125 // Create three allocations, one with a device, one without, and one 126 // terminal 127 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 128 129 // a2 will have a random ID since it is generated 130 131 allocs := []*Allocation{a1, a2, a3} 132 a3.DesiredStatus = AllocDesiredStatusStop 133 134 require.False(d.AddAllocs(allocs)) 135 require.Len(d.Devices, 2) 136 137 // Check that we have two devices for nvidia and that one of them is used 138 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 139 require.True(ok) 140 require.Len(nvidiaDevice.Instances, 2) 141 for _, v := range nvidiaDevice.Instances { 142 require.Equal(0, v) 143 } 144 } 145 146 // Test that collision detection works 147 func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) { 148 ci.Parallel(t) 149 150 require := require.New(t) 151 n := devNode() 152 d := NewDeviceAccounter(n) 153 require.NotNil(d) 154 155 // Create two allocations, both with the same device 156 a1, a2 := nvidiaAlloc(), nvidiaAlloc() 157 158 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 159 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 160 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 161 162 allocs := []*Allocation{a1, a2} 163 require.True(d.AddAllocs(allocs)) 164 } 165 166 // Assert that devices are not freed when an alloc's ServerTerminalStatus is 167 // true, but only when ClientTerminalStatus is true. 168 func TestDeviceAccounter_AddAllocs_TerminalStatus(t *testing.T) { 169 ci.Parallel(t) 170 171 n := devNode() 172 d := NewDeviceAccounter(n) 173 174 // Create two allocations, both with the same device. First is being told to 175 // stop but has not stopped yet. 176 a1, a2 := nvidiaAlloc(), nvidiaAlloc() 177 a1.DesiredStatus = AllocDesiredStatusStop 178 a1.ClientStatus = AllocClientStatusRunning 179 180 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 181 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 182 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 183 184 allocs := []*Allocation{a1, a2} 185 186 // Since a1 has not stopped on the client, its device is still in use 187 must.True(t, d.AddAllocs(allocs)) 188 189 // Assert that stop a1 on the client frees the device for use by a2 190 a1.ClientStatus = AllocClientStatusComplete 191 d = NewDeviceAccounter(n) 192 must.False(t, d.AddAllocs(allocs)) 193 } 194 195 // Make sure that the device allocator works even if the node has no devices 196 func TestDeviceAccounter_AddReserved_NoDeviceNode(t *testing.T) { 197 ci.Parallel(t) 198 199 require := require.New(t) 200 n := MockNode() 201 d := NewDeviceAccounter(n) 202 require.NotNil(d) 203 204 require.False(d.AddReserved(nvidiaAllocatedDevice())) 205 require.Len(d.Devices, 0) 206 } 207 208 // Add reserved to a node with a device 209 func TestDeviceAccounter_AddReserved(t *testing.T) { 210 ci.Parallel(t) 211 212 require := require.New(t) 213 n := devNode() 214 d := NewDeviceAccounter(n) 215 require.NotNil(d) 216 217 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 218 intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID 219 220 res := nvidiaAllocatedDevice() 221 res.DeviceIDs = []string{nvidiaDev0ID} 222 223 require.False(d.AddReserved(res)) 224 require.Len(d.Devices, 2) 225 226 // Check that we have two devices for nvidia and that one of them is used 227 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 228 require.True(ok) 229 require.Len(nvidiaDevice.Instances, 2) 230 require.Contains(nvidiaDevice.Instances, nvidiaDev0ID) 231 require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID]) 232 233 // Check only one instance of the intel device is set up since the other is 234 // unhealthy 235 intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()] 236 require.True(ok) 237 require.Len(intelDevice.Instances, 1) 238 require.Equal(0, intelDevice.Instances[intelDev0ID]) 239 } 240 241 // Test that collision detection works 242 func TestDeviceAccounter_AddReserved_Collision(t *testing.T) { 243 ci.Parallel(t) 244 245 require := require.New(t) 246 n := devNode() 247 d := NewDeviceAccounter(n) 248 require.NotNil(d) 249 250 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 251 252 // Create an alloc with nvidia 253 a1 := nvidiaAlloc() 254 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 255 require.False(d.AddAllocs([]*Allocation{a1})) 256 257 // Reserve the same device 258 res := nvidiaAllocatedDevice() 259 res.DeviceIDs = []string{nvidiaDev0ID} 260 require.True(d.AddReserved(res)) 261 }