github.com/hernad/nomad@v1.6.112/nomad/structs/devices_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package structs 5 6 import ( 7 "testing" 8 9 "github.com/hernad/nomad/ci" 10 "github.com/hernad/nomad/helper/uuid" 11 psstructs "github.com/hernad/nomad/plugins/shared/structs" 12 "github.com/shoenig/test/must" 13 "github.com/stretchr/testify/require" 14 ) 15 16 // nvidiaAllocatedDevice returns an allocated nvidia device 17 func nvidiaAllocatedDevice() *AllocatedDeviceResource { 18 return &AllocatedDeviceResource{ 19 Type: "gpu", 20 Vendor: "nvidia", 21 Name: "1080ti", 22 DeviceIDs: []string{uuid.Generate()}, 23 } 24 } 25 26 // nvidiaAlloc returns an allocation that has been assigned an nvidia device. 27 func nvidiaAlloc() *Allocation { 28 a := MockAlloc() 29 a.AllocatedResources.Tasks["web"].Devices = []*AllocatedDeviceResource{ 30 nvidiaAllocatedDevice(), 31 } 32 return a 33 } 34 35 // devNode returns a node containing two devices, an nvidia gpu and an intel 36 // FPGA. 37 func devNode() *Node { 38 n := MockNvidiaNode() 39 n.NodeResources.Devices = append(n.NodeResources.Devices, &NodeDeviceResource{ 40 Type: "fpga", 41 Vendor: "intel", 42 Name: "F100", 43 Attributes: map[string]*psstructs.Attribute{ 44 "memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB), 45 }, 46 Instances: []*NodeDevice{ 47 { 48 ID: uuid.Generate(), 49 Healthy: true, 50 }, 51 { 52 ID: uuid.Generate(), 53 Healthy: false, 54 }, 55 }, 56 }) 57 return n 58 } 59 60 // Make sure that the device accounter works even if the node has no devices 61 func TestDeviceAccounter_AddAllocs_NoDeviceNode(t *testing.T) { 62 ci.Parallel(t) 63 64 require := require.New(t) 65 n := MockNode() 66 d := NewDeviceAccounter(n) 67 require.NotNil(d) 68 69 // Create three allocations, one with a device, one without, and one 70 // terminal 71 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 72 allocs := []*Allocation{a1, a2, a3} 73 a3.DesiredStatus = AllocDesiredStatusStop 74 75 require.False(d.AddAllocs(allocs)) 76 require.Len(d.Devices, 0) 77 } 78 79 // Add allocs to a node with a device 80 func TestDeviceAccounter_AddAllocs(t *testing.T) { 81 ci.Parallel(t) 82 83 require := require.New(t) 84 n := devNode() 85 d := NewDeviceAccounter(n) 86 require.NotNil(d) 87 88 // Create three allocations, one with a device, one without, and one 89 // terminal 90 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 91 92 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 93 intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID 94 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 95 96 allocs := []*Allocation{a1, a2, a3} 97 a3.DesiredStatus = AllocDesiredStatusStop 98 99 require.False(d.AddAllocs(allocs)) 100 require.Len(d.Devices, 2) 101 102 // Check that we have two devices for nvidia and that one of them is used 103 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 104 require.True(ok) 105 require.Len(nvidiaDevice.Instances, 2) 106 require.Contains(nvidiaDevice.Instances, nvidiaDev0ID) 107 require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID]) 108 109 // Check only one instance of the intel device is set up since the other is 110 // unhealthy 111 intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()] 112 require.True(ok) 113 require.Len(intelDevice.Instances, 1) 114 require.Equal(0, intelDevice.Instances[intelDev0ID]) 115 } 116 117 // Add alloc with unknown ID to a node with devices. This tests that we can 118 // operate on previous allocs even if the device has changed to unhealthy and we 119 // don't track it 120 func TestDeviceAccounter_AddAllocs_UnknownID(t *testing.T) { 121 ci.Parallel(t) 122 123 require := require.New(t) 124 n := devNode() 125 d := NewDeviceAccounter(n) 126 require.NotNil(d) 127 128 // Create three allocations, one with a device, one without, and one 129 // terminal 130 a1, a2, a3 := MockAlloc(), nvidiaAlloc(), MockAlloc() 131 132 // a2 will have a random ID since it is generated 133 134 allocs := []*Allocation{a1, a2, a3} 135 a3.DesiredStatus = AllocDesiredStatusStop 136 137 require.False(d.AddAllocs(allocs)) 138 require.Len(d.Devices, 2) 139 140 // Check that we have two devices for nvidia and that one of them is used 141 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 142 require.True(ok) 143 require.Len(nvidiaDevice.Instances, 2) 144 for _, v := range nvidiaDevice.Instances { 145 require.Equal(0, v) 146 } 147 } 148 149 // Test that collision detection works 150 func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) { 151 ci.Parallel(t) 152 153 require := require.New(t) 154 n := devNode() 155 d := NewDeviceAccounter(n) 156 require.NotNil(d) 157 158 // Create two allocations, both with the same device 159 a1, a2 := nvidiaAlloc(), nvidiaAlloc() 160 161 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 162 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 163 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 164 165 allocs := []*Allocation{a1, a2} 166 require.True(d.AddAllocs(allocs)) 167 } 168 169 // Assert that devices are not freed when an alloc's ServerTerminalStatus is 170 // true, but only when ClientTerminalStatus is true. 171 func TestDeviceAccounter_AddAllocs_TerminalStatus(t *testing.T) { 172 ci.Parallel(t) 173 174 n := devNode() 175 d := NewDeviceAccounter(n) 176 177 // Create two allocations, both with the same device. First is being told to 178 // stop but has not stopped yet. 179 a1, a2 := nvidiaAlloc(), nvidiaAlloc() 180 a1.DesiredStatus = AllocDesiredStatusStop 181 a1.ClientStatus = AllocClientStatusRunning 182 183 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 184 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 185 a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 186 187 allocs := []*Allocation{a1, a2} 188 189 // Since a1 has not stopped on the client, its device is still in use 190 must.True(t, d.AddAllocs(allocs)) 191 192 // Assert that stop a1 on the client frees the device for use by a2 193 a1.ClientStatus = AllocClientStatusComplete 194 d = NewDeviceAccounter(n) 195 must.False(t, d.AddAllocs(allocs)) 196 } 197 198 // Make sure that the device allocator works even if the node has no devices 199 func TestDeviceAccounter_AddReserved_NoDeviceNode(t *testing.T) { 200 ci.Parallel(t) 201 202 require := require.New(t) 203 n := MockNode() 204 d := NewDeviceAccounter(n) 205 require.NotNil(d) 206 207 require.False(d.AddReserved(nvidiaAllocatedDevice())) 208 require.Len(d.Devices, 0) 209 } 210 211 // Add reserved to a node with a device 212 func TestDeviceAccounter_AddReserved(t *testing.T) { 213 ci.Parallel(t) 214 215 require := require.New(t) 216 n := devNode() 217 d := NewDeviceAccounter(n) 218 require.NotNil(d) 219 220 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 221 intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID 222 223 res := nvidiaAllocatedDevice() 224 res.DeviceIDs = []string{nvidiaDev0ID} 225 226 require.False(d.AddReserved(res)) 227 require.Len(d.Devices, 2) 228 229 // Check that we have two devices for nvidia and that one of them is used 230 nvidiaDevice, ok := d.Devices[*n.NodeResources.Devices[0].ID()] 231 require.True(ok) 232 require.Len(nvidiaDevice.Instances, 2) 233 require.Contains(nvidiaDevice.Instances, nvidiaDev0ID) 234 require.Equal(1, nvidiaDevice.Instances[nvidiaDev0ID]) 235 236 // Check only one instance of the intel device is set up since the other is 237 // unhealthy 238 intelDevice, ok := d.Devices[*n.NodeResources.Devices[1].ID()] 239 require.True(ok) 240 require.Len(intelDevice.Instances, 1) 241 require.Equal(0, intelDevice.Instances[intelDev0ID]) 242 } 243 244 // Test that collision detection works 245 func TestDeviceAccounter_AddReserved_Collision(t *testing.T) { 246 ci.Parallel(t) 247 248 require := require.New(t) 249 n := devNode() 250 d := NewDeviceAccounter(n) 251 require.NotNil(d) 252 253 nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID 254 255 // Create an alloc with nvidia 256 a1 := nvidiaAlloc() 257 a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} 258 require.False(d.AddAllocs([]*Allocation{a1})) 259 260 // Reserve the same device 261 res := nvidiaAllocatedDevice() 262 res.DeviceIDs = []string{nvidiaDev0ID} 263 require.True(d.AddReserved(res)) 264 }