github.com/hernad/nomad@v1.6.112/e2e/csi/ebs.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package csi

import (
	"fmt"
	"time"

	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/testutil"
)

// CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an
// example of a plugin that supports most of the CSI Controller RPCs.
type CSIControllerPluginEBSTest struct {
	framework.TC
	uuid         string
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
	nodeIDs      []string
}

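// ebsPluginID is the CSI plugin ID registered by the EBS controller and node
// plugin jobs and referenced by the `nomad volume` commands below.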
const ebsPluginID = "aws-ebs0"

// BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and
// creates two EBS volumes for use in the test.
func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) {
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)

	tc.uuid = uuid.Generate()[0:8]

	// deploy the controller plugin job
	controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid
	f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(controllerJobID, ns) },
		func(got []string) bool {
			if len(got) != 2 {
				return false
			}
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, pluginAllocWait,
	), "plugin job should be running")

	// deploy the node plugins job
	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

	f.NoError(e2eutil.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

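	// wait for the controller and node plugins to be fingerprinted as
	// healthy before creating the volumes they will manage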
	f.NoError(waitForPluginStatusControllerCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 controller plugins did not become healthy")
	f.NoError(waitForPluginStatusMinNodeCount(ebsPluginID, 2, pluginWait),
		"aws-ebs0 node plugins did not become healthy")

	// ideally we'd check `nomad volume status -verbose` here to verify these
	// volumes are ready, but the plugin doesn't support the CSI ListVolumes
	// RPC
	volID := "ebs-vol[0]"
	err := volumeRegister(volID, "csi/input/ebs-volume0.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	volID = "ebs-vol[1]"
	err = volumeRegister(volID, "csi/input/ebs-volume1.hcl", "create")
	requireNoErrorElseDump(f, err, "could not create volume", tc.pluginJobIDs)
	tc.volumeIDs = append(tc.volumeIDs, volID)
}

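// AfterEach restores any drained nodes, stops and purges the jobs submitted
// by each test, and garbage collects the cluster between test cases.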
func (tc *CSIControllerPluginEBSTest) AfterEach(f *framework.F) {

	// Ensure nodes are all restored
	for _, id := range tc.nodeIDs {
		_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
		_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	// Stop all jobs in test
	for _, id := range tc.testJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.testJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// AfterAll cleans up the volumes and plugin jobs created by the test.
func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {

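	// wait for volume claims to be released, then delete the volumes
	// created in BeforeAll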
	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.Assert().NoError(err, "volume claims were not released")

		out, err := e2eutil.Command("nomad", "volume", "delete", volID)
		assertNoErrorElseDump(f, err,
			fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs)
	}

	// Deregister all plugin jobs in test
	for _, id := range tc.pluginJobIDs {
		err := e2eutil.StopJob(id, "-purge")
		f.Assert().NoError(err)
	}
	tc.pluginJobIDs = []string{}

	// Garbage collect
	out, err := e2eutil.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// TestVolumeClaim exercises the volume publish/unpublish workflows for the
// EBS plugin.
func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
	nomadClient := tc.Nomad()

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// Shutdown (and purge) the writer so we can run a reader. We could
	// mount the EBS volume with multi-attach, but we want this test to
	// exercise the unpublish workflow.
	err = e2eutil.StopJob(writeJobID, "-purge")
	f.NoError(err)

	// wait for the volume unpublish workflow to complete
	for _, volID := range tc.volumeIDs {
		err := waitForVolumeClaimRelease(volID, reapWait)
		f.NoError(err, "volume claims were not released")
	}

	// deploy a job so we can read from the volume
	readJobID := "read-ebs-" + tc.uuid
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2eutil.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read data from the volume; the reader runs the same jobspec, so this
	// asserts the new alloc could mount the volume and write its own file
	expectedPath = "/task/test/" + readAllocID
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	f.NoError(err)
}

// TestSnapshot exercises the snapshot commands.
func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {

	out, err := e2eutil.Command("nomad", "volume", "snapshot", "create",
		tc.volumeIDs[0], "snap-"+tc.uuid)
	requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs)

	snaps, err := e2eutil.ParseColumns(out)
	f.NoError(err, fmt.Sprintf("could not parse output:\n%v", out))
	f.Len(snaps, 1, fmt.Sprintf("could not parse output:\n%v", out))

	// validate the parsed output before registering the cleanup, so the
	// deferred delete can't index into an empty result
	defer func() {
		_, err := e2eutil.Command("nomad", "volume", "snapshot", "delete",
			ebsPluginID, snaps[0]["Snapshot ID"])
		requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs)
	}()

	// the snapshot we're looking for should be the first one because
	// we just created it, but give us some breathing room to allow
	// for concurrent test runs
	out, err = e2eutil.Command("nomad", "volume", "snapshot", "list",
		"-plugin", ebsPluginID, "-per-page", "10")
	requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs)
	f.Contains(out, snaps[0]["Snapshot ID"],
		fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out))
}

// TestNodeDrain exercises the remounting behavior in the face of a node drain.
func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) {

	nomadClient := tc.Nomad()

	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
	pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	expectedHealthyNodePlugins := len(pluginAllocs)

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-for-drain-" + tc.uuid
	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")
	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up

	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	f.NoError(err)

	// intentionally set a long deadline so we can check the plugins
	// haven't been moved
	nodeID := allocs[0]["Node ID"]
	out, err := e2eutil.Command("nomad", "node",
		"drain", "-enable",
		"-deadline", "10m",
		"-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

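	// wait for a replacement alloc for the write job to be rescheduled and
	// running on another node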
	wc := &e2eutil.WaitConfig{}
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc["ID"] != writeAllocID {
				if alloc["Status"] == "running" {
					return true, nil
				}
				if alloc["Status"] == "failed" {
					// no point in waiting anymore if we hit this case
					f.T().Fatal("expected replacement alloc not to fail")
				}
			}
		}
		return false, fmt.Errorf("expected replacement alloc to be running")
	}, func(e error) {
		err = e
	})
	f.NoError(err, "expected replacement alloc to be running after node drain")

	pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns)
	f.NoError(err)
	f.Lenf(pluginAllocs, expectedHealthyNodePlugins,
		"expected node plugins to be unchanged, got: %v", pluginAllocs)
}