github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/e2e/csi/csi.go

package csi

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/nomad/api"
	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

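// CSIVolumesTest tracks the IDs of the test jobs, volumes, and plugin jobs
// created during each test case so that AfterEach can stop the jobs,
// deregister the volumes, and purge the plugin jobs once the case completes.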
type CSIVolumesTest struct {
	framework.TC
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "CSI",
		CanRunLocal: true,
		Consul:      false,
		Cases: []framework.TestCase{
			new(CSIVolumesTest),
		},
	})
}

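// ns is the namespace passed to the e2e helpers for all jobs and volumes in
// these tests; the empty string leaves the helpers on the default namespace.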
const ns = ""

var pluginWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36} // 3min
var reapWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36}   // 3min

func (tc *CSIVolumesTest) BeforeAll(f *framework.F) {
	t := f.T()

	_, err := os.Stat("csi/input/volume-ebs.hcl")
	if err != nil {
		t.Skip("skipping CSI test because EBS volume spec file missing:", err)
	}

	_, err = os.Stat("csi/input/volume-efs.hcl")
	if err != nil {
		t.Skip("skipping CSI test because EFS volume spec file missing:", err)
	}

	// Ensure the cluster has a leader and at least two client
	// nodes in a ready state before running tests.
	e2e.WaitForLeader(t, tc.Nomad())
	e2e.WaitForNodesReady(t, tc.Nomad(), 2)
}

// TestEBSVolumeClaim launches AWS EBS plugins and registers an EBS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// stop that job, and reuse the volume for another job which should be able
// to read the data written by the first job.
func (tc *CSIVolumesTest) TestEBSVolumeClaim(f *framework.F) {
	t := f.T()
	require := require.New(t)
	nomadClient := tc.Nomad()
	uuid := uuid.Generate()
	pluginID := "aws-ebs0"

	// deploy the controller plugin job
	controllerJobID := "aws-ebs-plugin-controller-" + uuid[0:8]
	f.NoError(e2e.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)
	expected := []string{"running", "running"}
	f.NoError(
		e2e.WaitForAllocStatusExpected(controllerJobID, ns, expected),
		"job should be running")

	// deploy the node plugins job
	nodesJobID := "aws-ebs-plugin-nodes-" + uuid[0:8]
	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

	f.NoError(e2e.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusControllerCount(pluginID, 2, pluginWait),
		"aws-ebs0 controller plugins did not become healthy")
	f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
		"aws-ebs0 node plugins did not become healthy")

	// register a volume
	// TODO: we don't have a unique ID threaded through the jobspec yet
	volID := "ebs-vol0"
	err := volumeRegister(volID, "csi/input/volume-ebs.hcl")
	require.NoError(err)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	// deploy a job that writes to the volume
	writeJobID := "write-ebs-" + uuid[0:8]
	f.NoError(e2e.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2e.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from the volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	require.NoError(err)

	// Shutdown (and purge) the writer so we can run a reader.
	// We could mount the EBS volume with multi-attach, but we
	// want this test to exercise the unpublish workflow.
	_, err = e2e.Command("nomad", "job", "stop", "-purge", writeJobID)
	require.NoError(err)

	// wait for the volume unpublish workflow to complete
	require.NoError(waitForVolumeClaimRelease(volID, reapWait),
		"write-ebs alloc claim was not released")

	// deploy a job so we can read from the volume
	readJobID := "read-ebs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2e.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read data from the volume and assert we can read the file the writer wrote
	expectedPath = "/task/test/" + readAllocID
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	require.NoError(err)
}

// TestEFSVolumeClaim launches AWS EFS plugins and registers an EFS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// and share the volume with another job which should be able to read the
// data written by the first job.
func (tc *CSIVolumesTest) TestEFSVolumeClaim(f *framework.F) {
	t := f.T()
	require := require.New(t)
	nomadClient := tc.Nomad()
	uuid := uuid.Generate()
	pluginID := "aws-efs0"

	// deploy the node plugins job (no need for a controller for EFS)
	nodesJobID := "aws-efs-plugin-nodes-" + uuid[0:8]
	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-efs-nodes.nomad"))
	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

	f.NoError(e2e.WaitForAllocStatusComparison(
		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
		func(got []string) bool {
			for _, status := range got {
				if status != "running" {
					return false
				}
			}
			return true
		}, nil,
	))

	f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
		"aws-efs0 node plugins did not become healthy")

	// register a volume
	volID := "efs-vol0"
	err := volumeRegister(volID, "csi/input/volume-efs.hcl")
	require.NoError(err)
	tc.volumeIDs = append(tc.volumeIDs, volID)

	// deploy a job that writes to the volume
	writeJobID := "write-efs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(writeJobID, "csi/input/use-efs-volume-write.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err := e2e.AllocsForJob(writeJobID, ns)
	f.NoError(err, "could not get allocs for write job")
	f.Len(allocs, 1, "could not get allocs for write job")
	writeAllocID := allocs[0]["ID"]

	// read data from the volume and assert the writer wrote a file to it
	expectedPath := "/task/test/" + writeAllocID
	_, err = readFile(nomadClient, writeAllocID, expectedPath)
	require.NoError(err)

	// Shutdown the writer so we can run a reader.
	// Although EFS should support multiple readers, the plugin
	// does not.
	_, err = e2e.Command("nomad", "job", "stop", writeJobID)
	require.NoError(err)

	// wait for the volume unpublish workflow to complete
	require.NoError(waitForVolumeClaimRelease(volID, reapWait),
		"write-efs alloc claim was not released")

	// deploy a job that reads from the volume
	readJobID := "read-efs-" + uuid[0:8]
	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
	f.NoError(e2e.Register(readJobID, "csi/input/use-efs-volume-read.nomad"))
	f.NoError(
		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
		"job should be running")

	allocs, err = e2e.AllocsForJob(readJobID, ns)
	f.NoError(err, "could not get allocs for read job")
	f.Len(allocs, 1, "could not get allocs for read job")
	readAllocID := allocs[0]["ID"]

	// read data from the volume and assert we can read the file the writer wrote
	_, err = readFile(nomadClient, readAllocID, expectedPath)
	require.NoError(err)
}

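// AfterEach cleans up after each test case: it stops and purges the test
// jobs, waits for their volume claims to be released before deregistering
// the volumes (dumping plugin logs if deregistration fails), purges the
// plugin jobs, and finally runs a system garbage collection.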
func (tc *CSIVolumesTest) AfterEach(f *framework.F) {

	// Stop all jobs in test
	for _, id := range tc.testJobIDs {
		out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err, out)
	}
	tc.testJobIDs = []string{}

	// Deregister all volumes in test
	for _, id := range tc.volumeIDs {
		// make sure all the test jobs have finished unpublishing claims
		err := waitForVolumeClaimRelease(id, reapWait)
		f.Assert().NoError(err, "volume claims were not released")

		out, err := e2e.Command("nomad", "volume", "deregister", id)
		if err != nil {
			fmt.Println("could not deregister volume, dumping allocation logs")
			f.Assert().NoError(tc.dumpLogs())
		}
		f.Assert().NoError(err, out)
	}
	tc.volumeIDs = []string{}

	// Deregister all plugin jobs in test
	for _, id := range tc.pluginJobIDs {
		out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err, out)
	}
	tc.pluginJobIDs = []string{}

	// Garbage collect
	out, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err, out)
}

// waitForVolumeClaimRelease makes sure we don't try to re-claim a volume
// that's in the process of being unpublished. We can't just wait for allocs
// to stop, but need to wait for their claims to be released.
func waitForVolumeClaimRelease(volID string, wc *e2e.WaitConfig) error {
	var out string
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err = e2e.Command("nomad", "volume", "status", volID)
		if err != nil {
			return false, err
		}
		section, err := e2e.GetSection(out, "Allocations")
		if err != nil {
			return false, err
		}
		return strings.Contains(section, "No allocations placed"), nil
	}, func(e error) {
		if e == nil {
			err = nil
			return
		}
		err = fmt.Errorf("alloc claim was not released: %v\n%s", e, out)
	})
	return err
}

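// dumpLogs writes the stderr logs of every plugin allocation either to
// stdout (when running in CI) or to per-allocation .log files, to help
// debug volumes that could not be deregistered.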
func (tc *CSIVolumesTest) dumpLogs() error {

	for _, id := range tc.pluginJobIDs {
		allocs, err := e2e.AllocsForJob(id, ns)
		if err != nil {
			return fmt.Errorf("could not find allocs for plugin: %v", err)
		}
		for _, alloc := range allocs {
			allocID := alloc["ID"]
			out, err := e2e.AllocLogs(allocID, e2e.LogsStdErr)
			if err != nil {
				return fmt.Errorf("could not get logs for alloc: %v\n%s", err, out)
			}
			_, isCI := os.LookupEnv("CI")
			if isCI {
				fmt.Println("--------------------------------------")
				fmt.Println("allocation logs:", allocID)
				fmt.Println(out)
				continue
			}
			f, err := os.Create(allocID + ".log")
			if err != nil {
				return fmt.Errorf("could not create log file: %v", err)
			}
			defer f.Close()
			_, err = f.WriteString(out)
			if err != nil {
				return fmt.Errorf("could not write to log file: %v", err)
			}
			fmt.Printf("nomad alloc logs written to %s.log\n", allocID)
		}
	}
	return nil
}

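// readFile returns the contents of a file inside the allocation by exec'ing
// `cat` in the "task" task and capturing its stdout.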
// TODO(tgross): replace this w/ AllocFS().Stat() after
// https://github.com/hashicorp/nomad/issues/7365 is fixed
func readFile(client *api.Client, allocID string, path string) (bytes.Buffer, error) {
	var stdout, stderr bytes.Buffer
	alloc, _, err := client.Allocations().Info(allocID, nil)
	if err != nil {
		return stdout, err
	}
	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancelFn()

	_, err = client.Allocations().Exec(ctx,
		alloc, "task", false,
		[]string{"cat", path},
		os.Stdin, &stdout, &stderr,
		make(chan api.TerminalSize), nil)
	return stdout, err
}

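// waitForPluginStatusMinNodeCount polls `nomad plugin status` until the
// plugin reports at least minCount expected node plugins and all of the
// expected node plugins are healthy.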
func waitForPluginStatusMinNodeCount(pluginID string, minCount int, wc *e2e.WaitConfig) error {

	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {
		expected, err := e2e.GetField(out, "Nodes Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount < minCount {
			return false, fmt.Errorf(
				"expected Nodes Expected >= %d, got %q", minCount, expected)
		}
		healthy, err := e2e.GetField(out, "Nodes Healthy")
		if err != nil {
			return false, err
		}
		if healthy != expected {
			return false, fmt.Errorf(
				"expected Nodes Healthy = %q, got %q", expected, healthy)
		}
		return true, nil
	}, wc)
}

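// waitForPluginStatusControllerCount polls `nomad plugin status` until the
// plugin reports exactly count expected and count healthy controller plugins.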
func waitForPluginStatusControllerCount(pluginID string, count int, wc *e2e.WaitConfig) error {

	return waitForPluginStatusCompare(pluginID, func(out string) (bool, error) {

		expected, err := e2e.GetField(out, "Controllers Expected")
		if err != nil {
			return false, err
		}
		expectedCount, err := strconv.Atoi(strings.TrimSpace(expected))
		if err != nil {
			return false, err
		}
		if expectedCount != count {
			return false, fmt.Errorf(
				"expected Controllers Expected = %d, got %d", count, expectedCount)
		}
		healthy, err := e2e.GetField(out, "Controllers Healthy")
		if err != nil {
			return false, err
		}
		healthyCount, err := strconv.Atoi(strings.TrimSpace(healthy))
		if err != nil {
			return false, err
		}
		if healthyCount != count {
			return false, fmt.Errorf(
				"expected Controllers Healthy = %d, got %d", count, healthyCount)
		}
		return true, nil

	}, wc)
}

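// waitForPluginStatusCompare polls `nomad plugin status` for the plugin and
// applies the compare function to its output until it passes or the retries
// in the wait config are exhausted.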
func waitForPluginStatusCompare(pluginID string, compare func(got string) (bool, error), wc *e2e.WaitConfig) error {
	var err error
	testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
		time.Sleep(wc.Interval)
		out, err := e2e.Command("nomad", "plugin", "status", pluginID)
		if err != nil {
			return false, err
		}
		return compare(out)
	}, func(e error) {
		err = fmt.Errorf("plugin status check failed: %v", e)
	})
	return err
}

// volumeRegister registers a volume spec from a file but with a unique ID.
// The caller is responsible for recording that ID for later cleanup.
func volumeRegister(volID, volFilePath string) error {

	cmd := exec.Command("nomad", "volume", "register", "-")
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("could not open stdin: %w", err)
	}

	content, err := ioutil.ReadFile(volFilePath)
	if err != nil {
		return fmt.Errorf("could not open vol file: %w", err)
	}

	// replace the id line in the spec with our unique volume ID
	var re = regexp.MustCompile(`(?m)^id ".*"`)
	volspec := re.ReplaceAllString(string(content),
		fmt.Sprintf("id = \"%s\"", volID))

	go func() {
		defer stdin.Close()
		io.WriteString(stdin, volspec)
	}()

	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("could not register vol: %w\n%v", err, string(out))
	}
	return nil
}