github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/csi/ebs.go (about) 1 package csi 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/hashicorp/nomad/e2e/e2eutil" 8 e2e "github.com/hashicorp/nomad/e2e/e2eutil" 9 "github.com/hashicorp/nomad/e2e/framework" 10 "github.com/hashicorp/nomad/helper/uuid" 11 "github.com/hashicorp/nomad/testutil" 12 ) 13 14 // CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an 15 // example of a plugin that supports most of the CSI Controller RPCs. 16 type CSIControllerPluginEBSTest struct { 17 framework.TC 18 uuid string 19 testJobIDs []string 20 volumeIDs []string 21 pluginJobIDs []string 22 nodeIDs []string 23 } 24 25 const ebsPluginID = "aws-ebs0" 26 27 // BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and 28 // creates two EBS volumes for use in the test. 29 func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) { 30 e2eutil.WaitForLeader(f.T(), tc.Nomad()) 31 e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2) 32 33 tc.uuid = uuid.Generate()[0:8] 34 35 // deploy the controller plugin job 36 controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid 37 f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad")) 38 tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID) 39 40 f.NoError(e2e.WaitForAllocStatusComparison( 41 func() ([]string, error) { return e2e.AllocStatuses(controllerJobID, ns) }, 42 func(got []string) bool { 43 if len(got) != 2 { 44 return false 45 } 46 for _, status := range got { 47 if status != "running" { 48 return false 49 } 50 } 51 return true 52 }, pluginAllocWait, 53 ), "plugin job should be running") 54 55 // deploy the node plugins job 56 nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid 57 f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad")) 58 tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID) 59 60 f.NoError(e2eutil.WaitForAllocStatusComparison( 61 func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) }, 62 func(got []string) bool { 63 for _, status := range got { 64 if status != "running" { 65 return false 66 } 67 } 68 return true 69 }, nil, 70 )) 71 72 f.NoError(waitForPluginStatusControllerCount(ebsPluginID, 2, pluginWait), 73 "aws-ebs0 controller plugins did not become healthy") 74 f.NoError(waitForPluginStatusMinNodeCount(ebsPluginID, 2, pluginWait), 75 "aws-ebs0 node plugins did not become healthy") 76 77 // ideally we'd wait until after we check `nomad volume status -verbose` 78 // to verify these volumes are ready, but the plugin doesn't support the 79 // CSI ListVolumes RPC 80 volID := "ebs-vol[0]" 81 err := volumeRegister(volID, "csi/input/ebs-volume0.hcl", "create") 82 requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs) 83 tc.volumeIDs = append(tc.volumeIDs, volID) 84 85 volID = "ebs-vol[1]" 86 err = volumeRegister(volID, "csi/input/ebs-volume1.hcl", "create") 87 requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs) 88 tc.volumeIDs = append(tc.volumeIDs, volID) 89 } 90 91 func (tc *CSIControllerPluginEBSTest) AfterEach(f *framework.F) { 92 93 // Ensure nodes are all restored 94 for _, id := range tc.nodeIDs { 95 _, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id) 96 f.Assert().NoError(err) 97 _, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id) 98 f.Assert().NoError(err) 99 } 100 tc.nodeIDs = []string{} 101 102 // Stop all jobs in test 103 for _, id := range tc.testJobIDs { 104 err := e2eutil.StopJob(id, "-purge") 105 f.Assert().NoError(err) 106 } 107 tc.testJobIDs = []string{} 108 109 // Garbage collect 110 out, err := e2eutil.Command("nomad", "system", "gc") 111 f.Assert().NoError(err, out) 112 } 113 114 // AfterAll cleans up the volumes and plugin jobs created by the test. 115 func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) { 116 117 for _, volID := range tc.volumeIDs { 118 err := waitForVolumeClaimRelease(volID, reapWait) 119 f.Assert().NoError(err, "volume claims were not released") 120 121 out, err := e2eutil.Command("nomad", "volume", "delete", volID) 122 assertNoErrorElseDump(f, err, 123 fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs) 124 } 125 126 // Deregister all plugin jobs in test 127 for _, id := range tc.pluginJobIDs { 128 err := e2eutil.StopJob(id, "-purge") 129 f.Assert().NoError(err) 130 } 131 tc.pluginJobIDs = []string{} 132 133 // Garbage collect 134 out, err := e2eutil.Command("nomad", "system", "gc") 135 f.Assert().NoError(err, out) 136 137 } 138 139 // TestVolumeClaim exercises the volume publish/unpublish workflows for the 140 // EBS plugin. 141 func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) { 142 nomadClient := tc.Nomad() 143 144 // deploy a job that writes to the volume 145 writeJobID := "write-ebs-" + tc.uuid 146 f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad")) 147 f.NoError( 148 e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}), 149 "job should be running") 150 151 allocs, err := e2eutil.AllocsForJob(writeJobID, ns) 152 f.NoError(err, "could not get allocs for write job") 153 f.Len(allocs, 1, "could not get allocs for write job") 154 writeAllocID := allocs[0]["ID"] 155 156 // read data from volume and assert the writer wrote a file to it 157 expectedPath := "/task/test/" + writeAllocID 158 _, err = readFile(nomadClient, writeAllocID, expectedPath) 159 f.NoError(err) 160 161 // Shutdown (and purge) the writer so we can run a reader. 162 // we could mount the EBS volume with multi-attach, but we 163 // want this test to exercise the unpublish workflow. 164 err = e2eutil.StopJob(writeJobID, "-purge") 165 f.NoError(err) 166 167 // wait for the volume unpublish workflow to complete 168 for _, volID := range tc.volumeIDs { 169 err := waitForVolumeClaimRelease(volID, reapWait) 170 f.NoError(err, "volume claims were not released") 171 } 172 173 // deploy a job so we can read from the volume 174 readJobID := "read-ebs-" + tc.uuid 175 tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up 176 f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad")) 177 f.NoError( 178 e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}), 179 "job should be running") 180 181 allocs, err = e2eutil.AllocsForJob(readJobID, ns) 182 f.NoError(err, "could not get allocs for read job") 183 f.Len(allocs, 1, "could not get allocs for read job") 184 readAllocID := allocs[0]["ID"] 185 186 // read data from volume and assert we can read the file the writer wrote 187 expectedPath = "/task/test/" + readAllocID 188 _, err = readFile(nomadClient, readAllocID, expectedPath) 189 f.NoError(err) 190 } 191 192 // TestSnapshot exercises the snapshot commands. 193 func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) { 194 195 out, err := e2eutil.Command("nomad", "volume", "snapshot", "create", 196 tc.volumeIDs[0], "snap-"+tc.uuid) 197 requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs) 198 199 snaps, err := e2eutil.ParseColumns(out) 200 201 defer func() { 202 _, err := e2eutil.Command("nomad", "volume", "snapshot", "delete", 203 ebsPluginID, snaps[0]["Snapshot ID"]) 204 requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs) 205 }() 206 207 f.NoError(err, fmt.Sprintf("could not parse output:\n%v", out)) 208 f.Len(snaps, 1, fmt.Sprintf("could not parse output:\n%v", out)) 209 210 // the snapshot we're looking for should be the first one because 211 // we just created it, but give us some breathing room to allow 212 // for concurrent test runs 213 out, err = e2eutil.Command("nomad", "volume", "snapshot", "list", 214 "-plugin", ebsPluginID, "-per-page", "10") 215 requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs) 216 f.Contains(out, snaps[0]["ID"], 217 fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out)) 218 } 219 220 // TestNodeDrain exercises the remounting behavior in the face of a node drain 221 func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) { 222 223 nomadClient := tc.Nomad() 224 225 nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid 226 pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns) 227 f.NoError(err) 228 expectedHealthyNodePlugins := len(pluginAllocs) 229 230 // deploy a job that writes to the volume 231 writeJobID := "write-ebs-for-drain" + tc.uuid 232 f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad")) 233 f.NoError( 234 e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}), 235 "job should be running") 236 tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up 237 238 allocs, err := e2eutil.AllocsForJob(writeJobID, ns) 239 f.NoError(err, "could not get allocs for write job") 240 f.Len(allocs, 1, "could not get allocs for write job") 241 writeAllocID := allocs[0]["ID"] 242 243 // read data from volume and assert the writer wrote a file to it 244 expectedPath := "/task/test/" + writeAllocID 245 _, err = readFile(nomadClient, writeAllocID, expectedPath) 246 f.NoError(err) 247 248 // intentionally set a long deadline so we can check the plugins 249 // haven't been moved 250 nodeID := allocs[0]["Node ID"] 251 out, err := e2eutil.Command("nomad", "node", 252 "drain", "-enable", 253 "-deadline", "10m", 254 "-yes", "-detach", nodeID) 255 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 256 tc.nodeIDs = append(tc.nodeIDs, nodeID) 257 258 wc := &e2eutil.WaitConfig{} 259 interval, retries := wc.OrDefault() 260 testutil.WaitForResultRetries(retries, func() (bool, error) { 261 time.Sleep(interval) 262 allocs, err := e2eutil.AllocsForJob(writeJobID, ns) 263 if err != nil { 264 return false, err 265 } 266 for _, alloc := range allocs { 267 if alloc["ID"] != writeAllocID { 268 if alloc["Status"] == "running" { 269 return true, nil 270 } 271 if alloc["Status"] == "failed" { 272 // no point in waiting anymore if we hit this case 273 f.T().Fatal("expected replacement alloc not to fail") 274 } 275 } 276 } 277 return false, fmt.Errorf("expected replacement alloc to be running") 278 }, func(e error) { 279 err = e 280 }) 281 282 pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns) 283 f.Lenf(pluginAllocs, expectedHealthyNodePlugins, 284 "expected node plugins to be unchanged, got: %v", pluginAllocs) 285 }