github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/remotetasks/remotetasks.go (about)

     1  package remotetasks
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"testing"
     7  	"time"
     8  
     9  	"github.com/aws/aws-sdk-go/aws"
    10  	"github.com/aws/aws-sdk-go/aws/session"
    11  	"github.com/aws/aws-sdk-go/service/ecs"
    12  	"github.com/hashicorp/nomad/api"
    13  	"github.com/hashicorp/nomad/e2e/e2eutil"
    14  	"github.com/hashicorp/nomad/e2e/framework"
    15  	"github.com/hashicorp/nomad/helper/uuid"
    16  	"github.com/hashicorp/nomad/jobspec2"
    17  	"github.com/hashicorp/nomad/plugins/base"
    18  	"github.com/hashicorp/nomad/testutil"
    19  	"github.com/stretchr/testify/assert"
    20  	"github.com/stretchr/testify/require"
    21  )
    22  
const (
	// ECS Task Statuses (currently unused statuses commented out to
	// appease linter)
	//ecsTaskStatusDeactivating   = "DEACTIVATING"
	//ecsTaskStatusStopping       = "STOPPING"
	//ecsTaskStatusDeprovisioning = "DEPROVISIONING"
	// ecsTaskStatusStopped is the ECS DescribeTasks LastStatus value
	// for a task that has fully stopped.
	ecsTaskStatusStopped = "STOPPED"
	// ecsTaskStatusRunning is the ECS DescribeTasks LastStatus value
	// for a task that is currently running.
	ecsTaskStatusRunning = "RUNNING"
)
    32  
// RemoteTasksTest is the e2e test case for the ECS remote task driver:
// jobs are registered with Nomad while the workload itself runs as an
// AWS ECS task in the "nomad-rtd-e2e" cluster.
type RemoteTasksTest struct {
	framework.TC
	// jobIDs accumulates the IDs of jobs registered during each test so
	// AfterEach can deregister and garbage collect them.
	jobIDs []string
}
    37  
    38  func init() {
    39  	framework.AddSuites(&framework.TestSuite{
    40  		Component:   "RemoteTasks",
    41  		CanRunLocal: true,
    42  		Cases: []framework.TestCase{
    43  			new(RemoteTasksTest),
    44  		},
    45  	})
    46  }
    47  
    48  func (tc *RemoteTasksTest) BeforeAll(f *framework.F) {
    49  	e2eutil.WaitForLeader(f.T(), tc.Nomad())
    50  	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)
    51  }
    52  
    53  func (tc *RemoteTasksTest) AfterEach(f *framework.F) {
    54  	nomadClient := tc.Nomad()
    55  
    56  	// Mark all nodes eligible
    57  	nodesAPI := tc.Nomad().Nodes()
    58  	nodes, _, _ := nodesAPI.List(nil)
    59  	for _, node := range nodes {
    60  		nodesAPI.ToggleEligibility(node.ID, true, nil)
    61  	}
    62  
    63  	jobs := nomadClient.Jobs()
    64  	// Stop all jobs in test
    65  	for _, id := range tc.jobIDs {
    66  		jobs.Deregister(id, true, nil)
    67  	}
    68  	tc.jobIDs = []string{}
    69  
    70  	// Garbage collect
    71  	nomadClient.System().GarbageCollect()
    72  }
    73  
    74  // TestECSJob asserts an ECS job may be started and is cleaned up when stopped.
    75  func (tc *RemoteTasksTest) TestECSJob(f *framework.F) {
    76  	t := f.T()
    77  
    78  	ecsClient := ecsOrSkip(t, tc.Nomad())
    79  
    80  	jobID := "ecsjob-" + uuid.Generate()[0:8]
    81  	tc.jobIDs = append(tc.jobIDs, jobID)
    82  	_, allocs := registerECSJobs(t, tc.Nomad(), jobID)
    83  	require.Len(t, allocs, 1)
    84  	allocID := allocs[0].ID
    85  	e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{allocID})
    86  
    87  	// We need to go from Allocation -> ECS ARN, so grab the updated
    88  	// allocation's task state.
    89  	arn := arnForAlloc(t, tc.Nomad().Allocations(), allocID)
    90  
    91  	// Use ARN to lookup status of ECS task in AWS
    92  	ensureECSRunning(t, ecsClient, arn)
    93  
    94  	t.Logf("Task %s is running!", arn)
    95  
    96  	// Stop the job
    97  	e2eutil.WaitForJobStopped(t, tc.Nomad(), jobID)
    98  
    99  	// Ensure it is stopped in ECS
   100  	input := ecs.DescribeTasksInput{
   101  		Cluster: aws.String("nomad-rtd-e2e"),
   102  		Tasks:   []*string{aws.String(arn)},
   103  	}
   104  	testutil.WaitForResult(func() (bool, error) {
   105  		resp, err := ecsClient.DescribeTasks(&input)
   106  		if err != nil {
   107  			return false, err
   108  		}
   109  		status := *resp.Tasks[0].LastStatus
   110  		return status == ecsTaskStatusStopped, fmt.Errorf("ecs task is not stopped: %s", status)
   111  	}, func(err error) {
   112  		t.Fatalf("error retrieving ecs task status: %v", err)
   113  	})
   114  }
   115  
// TestECSDrain asserts an ECS job may be started, drained from one node, and
// is managed by a new node without stopping and restarting the remote task.
//
// The key assertion is that the ECS task ARN observed before the drain is
// identical to the ARN observed via the replacement allocation afterward:
// the remote task survived the node migration.
func (tc *RemoteTasksTest) TestECSDrain(f *framework.F) {
	t := f.T()

	ecsClient := ecsOrSkip(t, tc.Nomad())

	jobID := "ecsjob-" + uuid.Generate()[0:8]
	tc.jobIDs = append(tc.jobIDs, jobID)
	_, allocs := registerECSJobs(t, tc.Nomad(), jobID)
	require.Len(t, allocs, 1)
	origNode := allocs[0].NodeID
	origAlloc := allocs[0].ID
	e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{origAlloc})

	// Resolve the ECS task ARN backing the original alloc and confirm it
	// is running in AWS before the drain starts.
	arn := arnForAlloc(t, tc.Nomad().Allocations(), origAlloc)
	ensureECSRunning(t, ecsClient, arn)

	t.Logf("Task %s is running! Now to drain the node.", arn)

	// Drain the node
	_, err := tc.Nomad().Nodes().UpdateDrain(
		origNode,
		&api.DrainSpec{Deadline: 30 * time.Second},
		false,
		nil,
	)
	require.NoError(t, err, "error draining original node")

	// Wait for new alloc to be running
	var newAlloc *api.AllocationListStub
	qopts := &api.QueryOptions{}
	testutil.WaitForResult(func() (bool, error) {
		allocs, resp, err := tc.Nomad().Jobs().Allocations(jobID, false, qopts)
		if err != nil {
			return false, fmt.Errorf("error retrieving allocations for job: %w", err)
		}

		// Advance the wait index so subsequent polls are blocking
		// queries that only return when the alloc list changes.
		qopts.WaitIndex = resp.LastIndex

		// Expect at most the original alloc plus one replacement.
		if len(allocs) > 2 {
			return false, fmt.Errorf("expected 1 or 2 allocs but found %d", len(allocs))
		}

		for _, alloc := range allocs {
			if alloc.ID == origAlloc {
				// This is the old alloc, skip it
				continue
			}

			newAlloc = alloc

			if newAlloc.ClientStatus == "running" {
				break
			}
		}

		if newAlloc == nil {
			return false, fmt.Errorf("no new alloc found")
		}
		if newAlloc.ClientStatus != "running" {
			return false, fmt.Errorf("expected new alloc (%s) to be running but found: %s",
				newAlloc.ID, newAlloc.ClientStatus)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("error waiting for new alloc to be running: %v", err)
	})

	// Make sure the ARN hasn't changed by looking up the new alloc's ARN
	newARN := arnForAlloc(t, tc.Nomad().Allocations(), newAlloc.ID)

	assert.Equal(t, arn, newARN, "unexpected new ARN")
}
   191  
   192  // TestECSDeployment asserts a new ECS task is started when an ECS job is
   193  // deployed.
   194  func (tc *RemoteTasksTest) TestECSDeployment(f *framework.F) {
   195  	t := f.T()
   196  
   197  	ecsClient := ecsOrSkip(t, tc.Nomad())
   198  
   199  	jobID := "ecsjob-" + uuid.Generate()[0:8]
   200  	tc.jobIDs = append(tc.jobIDs, jobID)
   201  	job, origAllocs := registerECSJobs(t, tc.Nomad(), jobID)
   202  	require.Len(t, origAllocs, 1)
   203  	origAllocID := origAllocs[0].ID
   204  	e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{origAllocID})
   205  
   206  	// We need to go from Allocation -> ECS ARN, so grab the updated
   207  	// allocation's task state.
   208  	origARN := arnForAlloc(t, tc.Nomad().Allocations(), origAllocID)
   209  
   210  	// Use ARN to lookup status of ECS task in AWS
   211  	ensureECSRunning(t, ecsClient, origARN)
   212  
   213  	t.Logf("Task %s is running! Updating...", origARN)
   214  
   215  	// Force a deployment by updating meta
   216  	job.Meta = map[string]string{
   217  		"updated": time.Now().Format(time.RFC3339Nano),
   218  	}
   219  
   220  	// Register updated job
   221  	resp, _, err := tc.Nomad().Jobs().Register(job, nil)
   222  	require.NoError(t, err, "error registering updated job")
   223  	require.NotEmpty(t, resp.EvalID, "no eval id created when registering updated job")
   224  
   225  	// Wait for new alloc to be running
   226  	var newAlloc *api.AllocationListStub
   227  	testutil.WaitForResult(func() (bool, error) {
   228  		allocs, _, err := tc.Nomad().Jobs().Allocations(jobID, false, nil)
   229  		if err != nil {
   230  			return false, err
   231  		}
   232  
   233  		for _, a := range allocs {
   234  			if a.ID == origAllocID {
   235  				if a.ClientStatus == "complete" {
   236  					// Original alloc stopped as expected!
   237  					continue
   238  				}
   239  
   240  				// Original alloc is still running
   241  				newAlloc = nil
   242  				return false, fmt.Errorf("original alloc not yet terminal. "+
   243  					"client status: %s; desired status: %s",
   244  					a.ClientStatus, a.DesiredStatus)
   245  			}
   246  
   247  			if a.ClientStatus != "running" {
   248  				return false, fmt.Errorf("new alloc is not running: %s", a.ClientStatus)
   249  			}
   250  
   251  			if newAlloc != nil {
   252  				return false, fmt.Errorf("found 2 replacement allocs: %s and %s",
   253  					a.ID, newAlloc.ID)
   254  			}
   255  
   256  			newAlloc = a
   257  		}
   258  
   259  		return newAlloc != nil, fmt.Errorf("no new alloc found for updated job")
   260  	}, func(err error) {
   261  		require.NoError(t, err, "error waiting for updated alloc")
   262  	})
   263  
   264  	newARN := arnForAlloc(t, tc.Nomad().Allocations(), newAlloc.ID)
   265  	t.Logf("Task %s is updated!", newARN)
   266  	require.NotEqual(t, origARN, newARN, "expected new ARN")
   267  
   268  	// Ensure original ARN is stopped in ECS
   269  	input := ecs.DescribeTasksInput{
   270  		Cluster: aws.String("nomad-rtd-e2e"),
   271  		Tasks:   []*string{aws.String(origARN)},
   272  	}
   273  	testutil.WaitForResult(func() (bool, error) {
   274  		resp, err := ecsClient.DescribeTasks(&input)
   275  		if err != nil {
   276  			return false, err
   277  		}
   278  		status := *resp.Tasks[0].LastStatus
   279  		return status == ecsTaskStatusStopped, fmt.Errorf("original ecs task is not stopped: %s", status)
   280  	}, func(err error) {
   281  		t.Fatalf("error retrieving ecs task status for original ARN: %v", err)
   282  	})
   283  }
   284  
// ecsOrSkip returns an AWS ECS client or skips the test if ECS is unreachable
// by the test runner or the ECS remote task driver isn't healthy.
//
// Two skip conditions: (1) the us-east-1 ECS API cannot be queried from
// the runner's AWS session, or (2) fewer than 2 nodes ever report a
// ready, eligible, healthy "ecs" driver within the wait window.
func ecsOrSkip(t *testing.T, nomadClient *api.Client) *ecs.ECS {
	// session.Must panics on session construction errors (credentials /
	// config), which is acceptable in a test helper.
	awsSession := session.Must(session.NewSession())

	ecsClient := ecs.New(awsSession, aws.NewConfig().WithRegion("us-east-1"))

	// Cheap connectivity probe: list clusters to verify credentials and
	// reachability before polling node state.
	_, err := ecsClient.ListClusters(&ecs.ListClustersInput{})
	if err != nil {
		t.Skipf("Skipping ECS Remote Task Driver Task. Error querying AWS ECS API: %v", err)
	}

	testutil.WaitForResult(func() (bool, error) {
		nodes, _, err := nomadClient.Nodes().List(nil)
		if err != nil {
			return false, fmt.Errorf("error retrieving node listing: %w", err)
		}

		// Tally why each node is excluded so the skip message explains
		// exactly what was missing.
		notReady := 0
		notEligible := 0
		noECS := 0
		notHealthy := 0
		ready := 0
		for _, n := range nodes {
			if n.Status != "ready" {
				notReady++
				continue
			}
			if n.SchedulingEligibility != "eligible" {
				notEligible++
				continue
			}
			ecsDriver, ok := n.Drivers["ecs"]
			if !ok {
				noECS++
				continue
			}
			if !ecsDriver.Healthy {
				notHealthy++
				continue
			}
			ready++
		}

		// Require at least 2 usable nodes (the drain test needs a
		// second node to migrate to).
		return ready > 1, fmt.Errorf("expected 2 nodes with healthy ecs drivers but found: "+
			"not_ready=%d ineligible=%d no_driver=%d unhealthy=%d ok=%d",
			notReady, notEligible, noECS, notHealthy, ready)
	}, func(err error) {
		// Skip rather than fail: an unhealthy driver means the e2e
		// environment isn't set up for remote tasks.
		if err != nil {
			t.Skipf("Skipping Remote Task Driver tests due to: %v", err)
		}
	})

	return ecsClient
}
   340  
// arnForAlloc retrieves the ARN for a running allocation.
//
// The ECS driver persists its state (including the task ARN) as
// MsgPack-encoded driver state on the alloc's "http-server" task
// handle; this polls until that state is present and decodable.
// Fails the test via t.Fatalf if the ARN never appears.
func arnForAlloc(t *testing.T, allocAPI *api.Allocations, allocID string) string {
	t.Logf("Retrieving ARN for alloc=%s", allocID)
	// Anonymous struct matching the ECS driver's persisted state; only
	// the ARN field is needed here.
	ecsState := struct {
		ARN string
	}{}
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err := allocAPI.Info(allocID, nil)
		if err != nil {
			return false, err
		}
		// Task name "http-server" must match the jobspec in
		// remotetasks/input/ecs.nomad.
		state := alloc.TaskStates["http-server"]
		if state == nil {
			return false, fmt.Errorf("no task state for http-server (%d task states)", len(alloc.TaskStates))
		}
		if state.TaskHandle == nil {
			return false, fmt.Errorf("no task handle for http-server")
		}
		if len(state.TaskHandle.DriverState) == 0 {
			return false, fmt.Errorf("no driver state for task handle")
		}
		if err := base.MsgPackDecode(state.TaskHandle.DriverState, &ecsState); err != nil {
			return false, fmt.Errorf("error decoding driver state: %w", err)
		}
		if ecsState.ARN == "" {
			return false, fmt.Errorf("ARN is empty despite DriverState being %d bytes", len(state.TaskHandle.DriverState))
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("error getting ARN: %v", err)
	})
	t.Logf("Retrieved ARN=%s for alloc=%s", ecsState.ARN, allocID)

	return ecsState.ARN
}
   376  
   377  // ensureECSRunning asserts that the given ARN is a running ECS task.
   378  func ensureECSRunning(t *testing.T, ecsClient *ecs.ECS, arn string) {
   379  	t.Logf("Ensuring ARN=%s is running", arn)
   380  	input := ecs.DescribeTasksInput{
   381  		Cluster: aws.String("nomad-rtd-e2e"),
   382  		Tasks:   []*string{aws.String(arn)},
   383  	}
   384  	testutil.WaitForResult(func() (bool, error) {
   385  		resp, err := ecsClient.DescribeTasks(&input)
   386  		if err != nil {
   387  			return false, err
   388  		}
   389  		status := *resp.Tasks[0].LastStatus
   390  		return status == ecsTaskStatusRunning, fmt.Errorf("ecs task is not running: %s", status)
   391  	}, func(err error) {
   392  		t.Fatalf("error retrieving ecs task status: %v", err)
   393  	})
   394  	t.Logf("ARN=%s is running", arn)
   395  }
   396  
   397  // registerECSJobs registers an ECS job and returns it and its allocation
   398  // stubs.
   399  func registerECSJobs(t *testing.T, nomadClient *api.Client, jobID string) (*api.Job, []*api.AllocationListStub) {
   400  	const (
   401  		jobPath = "remotetasks/input/ecs.nomad"
   402  		varPath = "remotetasks/input/ecs.vars"
   403  	)
   404  
   405  	jobBytes, err := os.ReadFile(jobPath)
   406  	require.NoError(t, err, "error reading job file")
   407  
   408  	job, err := jobspec2.ParseWithConfig(&jobspec2.ParseConfig{
   409  		Path:     jobPath,
   410  		Body:     jobBytes,
   411  		VarFiles: []string{varPath},
   412  		Strict:   true,
   413  	})
   414  	require.NoErrorf(t, err, "error parsing jobspec from %s with var file %s", jobPath, varPath)
   415  
   416  	job.ID = &jobID
   417  	job.Name = &jobID
   418  
   419  	// Register job
   420  	resp, _, err := nomadClient.Jobs().Register(job, nil)
   421  	require.NoError(t, err, "error registering job")
   422  	require.NotEmpty(t, resp.EvalID, "no eval id created when registering job")
   423  
   424  	var allocs []*api.AllocationListStub
   425  	testutil.WaitForResult(func() (bool, error) {
   426  		allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
   427  		if err != nil {
   428  			return false, err
   429  		}
   430  		return len(allocs) > 0, fmt.Errorf("no allocs found")
   431  	}, func(err error) {
   432  		require.NoErrorf(t, err, "error retrieving allocations for %s", jobID)
   433  	})
   434  	return job, allocs
   435  }