github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/csi/ebs.go

package csi

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

// CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an
// example of a plugin that supports most of the CSI Controller RPCs.
type CSIControllerPluginEBSTest struct {
	framework.TC
	uuid         string
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
	nodeIDs      []string
}

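// ebsPluginID corresponds to the `id` set in the csi_plugin block of the
// plugin jobspecs under csi/input/; the plugin status helpers used below look
// the plugin up by this ID.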
const ebsPluginID = "aws-ebs0"

// BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and
// creates two EBS volumes for use in the test.
func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) {
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)

	tc.uuid = uuid.Generate()[0:8]

	// deploy the controller plugin job
	controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid
	f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(controllerJobID, ns) },
		func(got []string) bool {
			if len(got) != 2 {
				return false
			}
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, pluginAllocWait,
	), "plugin job should be running")

	// deploy the node plugins job
	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusControllerCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 controller plugins did not become healthy")
	f.NoError(waitForPluginStatusMinNodeCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 node plugins did not become healthy")
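
	// For reference, the same health information can be inspected manually
	// with something along the lines of `nomad plugin status -type csi
	// aws-ebs0`, which reports healthy controller and node counts.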

	// ideally we'd wait here until `nomad volume status -verbose` reports
	// that these volumes are ready, but the plugin doesn't support the CSI
	// ListVolumes RPC
	volID := "ebs-vol[0]"
	err := volumeRegister(volID, "csi/input/ebs-volume0.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	volID = "ebs-vol[1]"
	err = volumeRegister(volID, "csi/input/ebs-volume1.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)
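
	// The volume specs live in csi/input/ebs-volume{0,1}.hcl. As a rough,
	// illustrative sketch only (the real files are authoritative), a spec
	// for `nomad volume create` looks something like:
	//
	//   id        = "ebs-vol[0]"
	//   name      = "ebs-vol"
	//   type      = "csi"
	//   plugin_id = "aws-ebs0"
	//
	//   capability {
	//     access_mode     = "single-node-writer"
	//     attachment_mode = "file-system"
	//   }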
}

func (tc *CSIControllerPluginEBSTest) AfterEach(f *framework.F) {

	// Ensure nodes are all restored
	for _, id := range tc.nodeIDs {
		_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
		_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	// Stop all jobs in test
	for _, id := range tc.testJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.testJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// AfterAll cleans up the volumes and plugin jobs created by the test.
func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {

	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.Assert().NoError(err, "volume claims were not released")

		out, err := e2eutil.Command("nomad", "volume", "delete", volID)
		assertNoErrorElseDump(f, err,
			fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs)
	}
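
	// Deleting a CSI volume goes through the controller plugin's DeleteVolume
	// RPC, so the volumes must be removed while the plugin jobs are still
	// running; only then are the plugin jobs themselves stopped below.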

	// Deregister all plugin jobs in test
	for _, id := range tc.pluginJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.pluginJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// TestVolumeClaim exercises the volume publish/unpublish workflows for the
// EBS plugin.
func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
	nomadClient := tc.Nomad()

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
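
	// use-ebs-volume.nomad claims one of the volumes created in BeforeAll.
	// Roughly (illustrative sketch only; the jobspec in csi/input/ is
	// authoritative), the relevant stanzas look like:
	//
	//   volume "test" {
	//     type      = "csi"
	//     source    = "ebs-vol"
	//     per_alloc = true
	//
	//     access_mode     = "single-node-writer"
	//     attachment_mode = "file-system"
	//   }
	//
	//   volume_mount {
	//     volume      = "test"
	//     destination = "/task/test"
	//   }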
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// Shut down (and purge) the writer so we can run a reader.
	// We could mount the EBS volume with multi-attach, but we
	// want this test to exercise the unpublish workflow.
	err = e2eutil.StopJob(writeJobID, "-purge")
	f.NoError(err)

	// wait for the volume unpublish workflow to complete
	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.NoError(err, "volume claims were not released")
	}
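
	// Once the claims are released, the node plugin has unpublished
	// (unmounted) the volume and the controller plugin has detached it from
	// the instance, so a new allocation, possibly on another node, can claim
	// it.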

	// deploy a job so we can read from the volume
	readJobID := "read-ebs-" + tc.uuid
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2eutil.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read data from volume and assert we can read the file the writer wrote
	expectedPath = "/task/test/" + readAllocID
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	f.NoError(err)
}

// TestSnapshot exercises the snapshot commands.
func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {

	out, err := e2eutil.Command("nomad", "volume", "snapshot", "create",
		tc.volumeIDs[0], "snap-"+tc.uuid)
	requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs)

	snaps, err := e2eutil.ParseColumns(out)

	defer func() {
		_, err := e2eutil.Command("nomad", "volume", "snapshot", "delete",
			ebsPluginID, snaps[0]["Snapshot ID"])
		requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs)
	}()

	f.NoError(err, fmt.Sprintf("could not parse output:\n%v", out))
	f.Len(snaps, 1, fmt.Sprintf("could not parse output:\n%v", out))

	// the snapshot we're looking for should be the first one because
	// we just created it, but give us some breathing room to allow
	// for concurrent test runs
	out, err = e2eutil.Command("nomad", "volume", "snapshot", "list",
		"-plugin", ebsPluginID, "-per-page", "10")
	requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs)
	f.Contains(out, snaps[0]["ID"],
		fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out))
}

// TestNodeDrain exercises the remounting behavior in the face of a node drain.
func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) {

	nomadClient := tc.Nomad()

	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	expectedHealthyNodePlugins := len(pluginAllocs)

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-for-drain-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")
	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// intentionally set a long drain deadline so we can check that the
	// plugins haven't been moved
	nodeID := allocs[0]["Node ID"]
	out, err := e2eutil.Command("nomad", "node",
		"drain", "-enable",
		"-deadline", "10m",
		"-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)
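
	// The drain forces the write allocation to be replaced on another node,
	// which exercises unpublishing the volume from the drained node and
	// republishing it where the replacement lands; the long deadline keeps
	// the node plugin allocs in place so the final count check is meaningful.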

	wc := &e2eutil.WaitConfig{}
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc["ID"] != writeAllocID {
				if alloc["Status"] == "running" {
					return true, nil
				}
				if alloc["Status"] == "failed" {
					// no point in waiting anymore if we hit this case
					f.T().Fatal("expected replacement alloc not to fail")
				}
			}
		}
		return false, fmt.Errorf("expected replacement alloc to be running")
	}, func(e error) {
		err = e
	})

	f.NoError(err, "expected replacement alloc to be running")

	pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	f.Lenf(pluginAllocs, expectedHealthyNodePlugins,
		"expected node plugins to be unchanged, got: %v", pluginAllocs)
}