github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/remotetasks/remotetasks.go (about) 1 package remotetasks 2 3 import ( 4 "fmt" 5 "os" 6 "testing" 7 "time" 8 9 "github.com/aws/aws-sdk-go/aws" 10 "github.com/aws/aws-sdk-go/aws/session" 11 "github.com/aws/aws-sdk-go/service/ecs" 12 "github.com/hashicorp/nomad/api" 13 "github.com/hashicorp/nomad/e2e/e2eutil" 14 "github.com/hashicorp/nomad/e2e/framework" 15 "github.com/hashicorp/nomad/helper/uuid" 16 "github.com/hashicorp/nomad/jobspec2" 17 "github.com/hashicorp/nomad/plugins/base" 18 "github.com/hashicorp/nomad/testutil" 19 "github.com/stretchr/testify/assert" 20 "github.com/stretchr/testify/require" 21 ) 22 23 const ( 24 // ECS Task Statuses (currently unused statuses commented out to 25 // appease linter) 26 //ecsTaskStatusDeactivating = "DEACTIVATING" 27 //ecsTaskStatusStopping = "STOPPING" 28 //ecsTaskStatusDeprovisioning = "DEPROVISIONING" 29 ecsTaskStatusStopped = "STOPPED" 30 ecsTaskStatusRunning = "RUNNING" 31 ) 32 33 type RemoteTasksTest struct { 34 framework.TC 35 jobIDs []string 36 } 37 38 func init() { 39 framework.AddSuites(&framework.TestSuite{ 40 Component: "RemoteTasks", 41 CanRunLocal: true, 42 Cases: []framework.TestCase{ 43 new(RemoteTasksTest), 44 }, 45 }) 46 } 47 48 func (tc *RemoteTasksTest) BeforeAll(f *framework.F) { 49 e2eutil.WaitForLeader(f.T(), tc.Nomad()) 50 e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2) 51 } 52 53 func (tc *RemoteTasksTest) AfterEach(f *framework.F) { 54 nomadClient := tc.Nomad() 55 56 // Mark all nodes eligible 57 nodesAPI := tc.Nomad().Nodes() 58 nodes, _, _ := nodesAPI.List(nil) 59 for _, node := range nodes { 60 nodesAPI.ToggleEligibility(node.ID, true, nil) 61 } 62 63 jobs := nomadClient.Jobs() 64 // Stop all jobs in test 65 for _, id := range tc.jobIDs { 66 jobs.Deregister(id, true, nil) 67 } 68 tc.jobIDs = []string{} 69 70 // Garbage collect 71 nomadClient.System().GarbageCollect() 72 } 73 74 // TestECSJob asserts an ECS job may be started and is cleaned up when stopped. 75 func (tc *RemoteTasksTest) TestECSJob(f *framework.F) { 76 t := f.T() 77 78 ecsClient := ecsOrSkip(t, tc.Nomad()) 79 80 jobID := "ecsjob-" + uuid.Generate()[0:8] 81 tc.jobIDs = append(tc.jobIDs, jobID) 82 _, allocs := registerECSJobs(t, tc.Nomad(), jobID) 83 require.Len(t, allocs, 1) 84 allocID := allocs[0].ID 85 e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{allocID}) 86 87 // We need to go from Allocation -> ECS ARN, so grab the updated 88 // allocation's task state. 89 arn := arnForAlloc(t, tc.Nomad().Allocations(), allocID) 90 91 // Use ARN to lookup status of ECS task in AWS 92 ensureECSRunning(t, ecsClient, arn) 93 94 t.Logf("Task %s is running!", arn) 95 96 // Stop the job 97 e2eutil.WaitForJobStopped(t, tc.Nomad(), jobID) 98 99 // Ensure it is stopped in ECS 100 input := ecs.DescribeTasksInput{ 101 Cluster: aws.String("nomad-rtd-e2e"), 102 Tasks: []*string{aws.String(arn)}, 103 } 104 testutil.WaitForResult(func() (bool, error) { 105 resp, err := ecsClient.DescribeTasks(&input) 106 if err != nil { 107 return false, err 108 } 109 status := *resp.Tasks[0].LastStatus 110 return status == ecsTaskStatusStopped, fmt.Errorf("ecs task is not stopped: %s", status) 111 }, func(err error) { 112 t.Fatalf("error retrieving ecs task status: %v", err) 113 }) 114 } 115 116 // TestECSDrain asserts an ECS job may be started, drained from one node, and 117 // is managed by a new node without stopping and restarting the remote task. 118 func (tc *RemoteTasksTest) TestECSDrain(f *framework.F) { 119 t := f.T() 120 121 ecsClient := ecsOrSkip(t, tc.Nomad()) 122 123 jobID := "ecsjob-" + uuid.Generate()[0:8] 124 tc.jobIDs = append(tc.jobIDs, jobID) 125 _, allocs := registerECSJobs(t, tc.Nomad(), jobID) 126 require.Len(t, allocs, 1) 127 origNode := allocs[0].NodeID 128 origAlloc := allocs[0].ID 129 e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{origAlloc}) 130 131 arn := arnForAlloc(t, tc.Nomad().Allocations(), origAlloc) 132 ensureECSRunning(t, ecsClient, arn) 133 134 t.Logf("Task %s is running! Now to drain the node.", arn) 135 136 // Drain the node 137 _, err := tc.Nomad().Nodes().UpdateDrain( 138 origNode, 139 &api.DrainSpec{Deadline: 30 * time.Second}, 140 false, 141 nil, 142 ) 143 require.NoError(t, err, "error draining original node") 144 145 // Wait for new alloc to be running 146 var newAlloc *api.AllocationListStub 147 qopts := &api.QueryOptions{} 148 testutil.WaitForResult(func() (bool, error) { 149 allocs, resp, err := tc.Nomad().Jobs().Allocations(jobID, false, qopts) 150 if err != nil { 151 return false, fmt.Errorf("error retrieving allocations for job: %w", err) 152 } 153 154 qopts.WaitIndex = resp.LastIndex 155 156 if len(allocs) > 2 { 157 return false, fmt.Errorf("expected 1 or 2 allocs but found %d", len(allocs)) 158 } 159 160 for _, alloc := range allocs { 161 if alloc.ID == origAlloc { 162 // This is the old alloc, skip it 163 continue 164 } 165 166 newAlloc = alloc 167 168 if newAlloc.ClientStatus == "running" { 169 break 170 } 171 } 172 173 if newAlloc == nil { 174 return false, fmt.Errorf("no new alloc found") 175 } 176 if newAlloc.ClientStatus != "running" { 177 return false, fmt.Errorf("expected new alloc (%s) to be running but found: %s", 178 newAlloc.ID, newAlloc.ClientStatus) 179 } 180 181 return true, nil 182 }, func(err error) { 183 t.Fatalf("error waiting for new alloc to be running: %v", err) 184 }) 185 186 // Make sure the ARN hasn't changed by looking up the new alloc's ARN 187 newARN := arnForAlloc(t, tc.Nomad().Allocations(), newAlloc.ID) 188 189 assert.Equal(t, arn, newARN, "unexpected new ARN") 190 } 191 192 // TestECSDeployment asserts a new ECS task is started when an ECS job is 193 // deployed. 194 func (tc *RemoteTasksTest) TestECSDeployment(f *framework.F) { 195 t := f.T() 196 197 ecsClient := ecsOrSkip(t, tc.Nomad()) 198 199 jobID := "ecsjob-" + uuid.Generate()[0:8] 200 tc.jobIDs = append(tc.jobIDs, jobID) 201 job, origAllocs := registerECSJobs(t, tc.Nomad(), jobID) 202 require.Len(t, origAllocs, 1) 203 origAllocID := origAllocs[0].ID 204 e2eutil.WaitForAllocsRunning(t, tc.Nomad(), []string{origAllocID}) 205 206 // We need to go from Allocation -> ECS ARN, so grab the updated 207 // allocation's task state. 208 origARN := arnForAlloc(t, tc.Nomad().Allocations(), origAllocID) 209 210 // Use ARN to lookup status of ECS task in AWS 211 ensureECSRunning(t, ecsClient, origARN) 212 213 t.Logf("Task %s is running! Updating...", origARN) 214 215 // Force a deployment by updating meta 216 job.Meta = map[string]string{ 217 "updated": time.Now().Format(time.RFC3339Nano), 218 } 219 220 // Register updated job 221 resp, _, err := tc.Nomad().Jobs().Register(job, nil) 222 require.NoError(t, err, "error registering updated job") 223 require.NotEmpty(t, resp.EvalID, "no eval id created when registering updated job") 224 225 // Wait for new alloc to be running 226 var newAlloc *api.AllocationListStub 227 testutil.WaitForResult(func() (bool, error) { 228 allocs, _, err := tc.Nomad().Jobs().Allocations(jobID, false, nil) 229 if err != nil { 230 return false, err 231 } 232 233 for _, a := range allocs { 234 if a.ID == origAllocID { 235 if a.ClientStatus == "complete" { 236 // Original alloc stopped as expected! 237 continue 238 } 239 240 // Original alloc is still running 241 newAlloc = nil 242 return false, fmt.Errorf("original alloc not yet terminal. "+ 243 "client status: %s; desired status: %s", 244 a.ClientStatus, a.DesiredStatus) 245 } 246 247 if a.ClientStatus != "running" { 248 return false, fmt.Errorf("new alloc is not running: %s", a.ClientStatus) 249 } 250 251 if newAlloc != nil { 252 return false, fmt.Errorf("found 2 replacement allocs: %s and %s", 253 a.ID, newAlloc.ID) 254 } 255 256 newAlloc = a 257 } 258 259 return newAlloc != nil, fmt.Errorf("no new alloc found for updated job") 260 }, func(err error) { 261 require.NoError(t, err, "error waiting for updated alloc") 262 }) 263 264 newARN := arnForAlloc(t, tc.Nomad().Allocations(), newAlloc.ID) 265 t.Logf("Task %s is updated!", newARN) 266 require.NotEqual(t, origARN, newARN, "expected new ARN") 267 268 // Ensure original ARN is stopped in ECS 269 input := ecs.DescribeTasksInput{ 270 Cluster: aws.String("nomad-rtd-e2e"), 271 Tasks: []*string{aws.String(origARN)}, 272 } 273 testutil.WaitForResult(func() (bool, error) { 274 resp, err := ecsClient.DescribeTasks(&input) 275 if err != nil { 276 return false, err 277 } 278 status := *resp.Tasks[0].LastStatus 279 return status == ecsTaskStatusStopped, fmt.Errorf("original ecs task is not stopped: %s", status) 280 }, func(err error) { 281 t.Fatalf("error retrieving ecs task status for original ARN: %v", err) 282 }) 283 } 284 285 // ecsOrSkip returns an AWS ECS client or skips the test if ECS is unreachable 286 // by the test runner or the ECS remote task driver isn't healthy. 287 func ecsOrSkip(t *testing.T, nomadClient *api.Client) *ecs.ECS { 288 awsSession := session.Must(session.NewSession()) 289 290 ecsClient := ecs.New(awsSession, aws.NewConfig().WithRegion("us-east-1")) 291 292 _, err := ecsClient.ListClusters(&ecs.ListClustersInput{}) 293 if err != nil { 294 t.Skipf("Skipping ECS Remote Task Driver Task. Error querying AWS ECS API: %v", err) 295 } 296 297 testutil.WaitForResult(func() (bool, error) { 298 nodes, _, err := nomadClient.Nodes().List(nil) 299 if err != nil { 300 return false, fmt.Errorf("error retrieving node listing: %w", err) 301 } 302 303 notReady := 0 304 notEligible := 0 305 noECS := 0 306 notHealthy := 0 307 ready := 0 308 for _, n := range nodes { 309 if n.Status != "ready" { 310 notReady++ 311 continue 312 } 313 if n.SchedulingEligibility != "eligible" { 314 notEligible++ 315 continue 316 } 317 ecsDriver, ok := n.Drivers["ecs"] 318 if !ok { 319 noECS++ 320 continue 321 } 322 if !ecsDriver.Healthy { 323 notHealthy++ 324 continue 325 } 326 ready++ 327 } 328 329 return ready > 1, fmt.Errorf("expected 2 nodes with healthy ecs drivers but found: "+ 330 "not_ready=%d ineligible=%d no_driver=%d unhealthy=%d ok=%d", 331 notReady, notEligible, noECS, notHealthy, ready) 332 }, func(err error) { 333 if err != nil { 334 t.Skipf("Skipping Remote Task Driver tests due to: %v", err) 335 } 336 }) 337 338 return ecsClient 339 } 340 341 // arnForAlloc retrieves the ARN for a running allocation. 342 func arnForAlloc(t *testing.T, allocAPI *api.Allocations, allocID string) string { 343 t.Logf("Retrieving ARN for alloc=%s", allocID) 344 ecsState := struct { 345 ARN string 346 }{} 347 testutil.WaitForResult(func() (bool, error) { 348 alloc, _, err := allocAPI.Info(allocID, nil) 349 if err != nil { 350 return false, err 351 } 352 state := alloc.TaskStates["http-server"] 353 if state == nil { 354 return false, fmt.Errorf("no task state for http-server (%d task states)", len(alloc.TaskStates)) 355 } 356 if state.TaskHandle == nil { 357 return false, fmt.Errorf("no task handle for http-server") 358 } 359 if len(state.TaskHandle.DriverState) == 0 { 360 return false, fmt.Errorf("no driver state for task handle") 361 } 362 if err := base.MsgPackDecode(state.TaskHandle.DriverState, &ecsState); err != nil { 363 return false, fmt.Errorf("error decoding driver state: %w", err) 364 } 365 if ecsState.ARN == "" { 366 return false, fmt.Errorf("ARN is empty despite DriverState being %d bytes", len(state.TaskHandle.DriverState)) 367 } 368 return true, nil 369 }, func(err error) { 370 t.Fatalf("error getting ARN: %v", err) 371 }) 372 t.Logf("Retrieved ARN=%s for alloc=%s", ecsState.ARN, allocID) 373 374 return ecsState.ARN 375 } 376 377 // ensureECSRunning asserts that the given ARN is a running ECS task. 378 func ensureECSRunning(t *testing.T, ecsClient *ecs.ECS, arn string) { 379 t.Logf("Ensuring ARN=%s is running", arn) 380 input := ecs.DescribeTasksInput{ 381 Cluster: aws.String("nomad-rtd-e2e"), 382 Tasks: []*string{aws.String(arn)}, 383 } 384 testutil.WaitForResult(func() (bool, error) { 385 resp, err := ecsClient.DescribeTasks(&input) 386 if err != nil { 387 return false, err 388 } 389 status := *resp.Tasks[0].LastStatus 390 return status == ecsTaskStatusRunning, fmt.Errorf("ecs task is not running: %s", status) 391 }, func(err error) { 392 t.Fatalf("error retrieving ecs task status: %v", err) 393 }) 394 t.Logf("ARN=%s is running", arn) 395 } 396 397 // registerECSJobs registers an ECS job and returns it and its allocation 398 // stubs. 399 func registerECSJobs(t *testing.T, nomadClient *api.Client, jobID string) (*api.Job, []*api.AllocationListStub) { 400 const ( 401 jobPath = "remotetasks/input/ecs.nomad" 402 varPath = "remotetasks/input/ecs.vars" 403 ) 404 405 jobBytes, err := os.ReadFile(jobPath) 406 require.NoError(t, err, "error reading job file") 407 408 job, err := jobspec2.ParseWithConfig(&jobspec2.ParseConfig{ 409 Path: jobPath, 410 Body: jobBytes, 411 VarFiles: []string{varPath}, 412 Strict: true, 413 }) 414 require.NoErrorf(t, err, "error parsing jobspec from %s with var file %s", jobPath, varPath) 415 416 job.ID = &jobID 417 job.Name = &jobID 418 419 // Register job 420 resp, _, err := nomadClient.Jobs().Register(job, nil) 421 require.NoError(t, err, "error registering job") 422 require.NotEmpty(t, resp.EvalID, "no eval id created when registering job") 423 424 var allocs []*api.AllocationListStub 425 testutil.WaitForResult(func() (bool, error) { 426 allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil) 427 if err != nil { 428 return false, err 429 } 430 return len(allocs) > 0, fmt.Errorf("no allocs found") 431 }, func(err error) { 432 require.NoErrorf(t, err, "error retrieving allocations for %s", jobID) 433 }) 434 return job, allocs 435 }