github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/task_runner_test.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"github.com/golang/snappy"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	consulapi "github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	ctestutil "github.com/hashicorp/nomad/client/testutil"
	"github.com/hashicorp/nomad/client/vaultclient"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	mockdriver "github.com/hashicorp/nomad/drivers/mock"
	"github.com/hashicorp/nomad/drivers/rawexec"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
	"github.com/hashicorp/nomad/testutil"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type MockTaskStateUpdater struct {
	ch chan struct{}
}

func NewMockTaskStateUpdater() *MockTaskStateUpdater {
	return &MockTaskStateUpdater{
		ch: make(chan struct{}, 1),
	}
}

func (m *MockTaskStateUpdater) TaskStateUpdated() {
	select {
	case m.ch <- struct{}{}:
	default:
	}
}

// testTaskRunnerConfig returns a taskrunner.Config for the given alloc+task
// plus a cleanup func.
func testTaskRunnerConfig(t *testing.T, alloc *structs.Allocation, taskName string) (*Config, func()) {
	logger := testlog.HCLogger(t)
	clientConf, cleanup := config.TestClientConfig(t)

	// Find the task
	var thisTask *structs.Task
	for _, tg := range alloc.Job.TaskGroups {
		for _, task := range tg.Tasks {
			if task.Name == taskName {
				if thisTask != nil {
					cleanup()
					t.Fatalf("multiple tasks named %q; cannot use this helper", taskName)
				}
				thisTask = task
			}
		}
	}
	if thisTask == nil {
		cleanup()
		t.Fatalf("could not find task %q", taskName)
	}

	// Create the alloc dir + task dir
	allocPath := filepath.Join(clientConf.AllocDir, alloc.ID)
	allocDir := allocdir.NewAllocDir(logger, allocPath)
	if err := allocDir.Build(); err != nil {
		cleanup()
		t.Fatalf("error building alloc dir: %v", err)
	}
	taskDir := allocDir.NewTaskDir(taskName)

	trCleanup := func() {
		if err := allocDir.Destroy(); err != nil {
			t.Logf("error destroying alloc dir: %v", err)
		}
		cleanup()
	}

	// Create a closed channel to mock TaskHookCoordinator.startConditionForTask.
	// Closed channel indicates this task is not blocked on prestart hooks.
101 closedCh := make(chan struct{}) 102 close(closedCh) 103 104 conf := &Config{ 105 Alloc: alloc, 106 ClientConfig: clientConf, 107 Task: thisTask, 108 TaskDir: taskDir, 109 Logger: clientConf.Logger, 110 Consul: consulapi.NewMockConsulServiceClient(t, logger), 111 ConsulSI: consulapi.NewMockServiceIdentitiesClient(), 112 Vault: vaultclient.NewMockVaultClient(), 113 StateDB: cstate.NoopDB{}, 114 StateUpdater: NewMockTaskStateUpdater(), 115 DeviceManager: devicemanager.NoopMockManager(), 116 DriverManager: drivermanager.TestDriverManager(t), 117 ServersContactedCh: make(chan struct{}), 118 StartConditionMetCtx: closedCh, 119 } 120 return conf, trCleanup 121 } 122 123 // runTestTaskRunner runs a TaskRunner and returns its configuration as well as 124 // a cleanup function that ensures the runner is stopped and cleaned up. Tests 125 // which need to change the Config *must* use testTaskRunnerConfig instead. 126 func runTestTaskRunner(t *testing.T, alloc *structs.Allocation, taskName string) (*TaskRunner, *Config, func()) { 127 config, cleanup := testTaskRunnerConfig(t, alloc, taskName) 128 129 tr, err := NewTaskRunner(config) 130 require.NoError(t, err) 131 go tr.Run() 132 133 return tr, config, func() { 134 tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 135 cleanup() 136 } 137 } 138 139 // TestTaskRunner_Restore_Running asserts restoring a running task does not 140 // rerun the task. 141 func TestTaskRunner_Restore_Running(t *testing.T) { 142 t.Parallel() 143 require := require.New(t) 144 145 alloc := mock.BatchAlloc() 146 alloc.Job.TaskGroups[0].Count = 1 147 task := alloc.Job.TaskGroups[0].Tasks[0] 148 task.Driver = "mock_driver" 149 task.Config = map[string]interface{}{ 150 "run_for": "2s", 151 } 152 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 153 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between task runners 154 defer cleanup() 155 156 // Run the first TaskRunner 157 origTR, err := NewTaskRunner(conf) 158 require.NoError(err) 159 go origTR.Run() 160 defer origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 161 162 // Wait for it to be running 163 testWaitForTaskToStart(t, origTR) 164 165 // Cause TR to exit without shutting down task 166 origTR.Shutdown() 167 168 // Start a new TaskRunner and make sure it does not rerun the task 169 newTR, err := NewTaskRunner(conf) 170 require.NoError(err) 171 172 // Do the Restore 173 require.NoError(newTR.Restore()) 174 175 go newTR.Run() 176 defer newTR.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 177 178 // Wait for new task runner to exit when the process does 179 <-newTR.WaitCh() 180 181 // Assert that the process was only started once 182 started := 0 183 state := newTR.TaskState() 184 require.Equal(structs.TaskStateDead, state.State) 185 for _, ev := range state.Events { 186 if ev.Type == structs.TaskStarted { 187 started++ 188 } 189 } 190 assert.Equal(t, 1, started) 191 } 192 193 // setupRestoreFailureTest starts a service, shuts down the task runner, and 194 // kills the task before restarting a new TaskRunner. The new TaskRunner is 195 // returned once it is running and waiting in pending along with a cleanup 196 // func. 
197 func setupRestoreFailureTest(t *testing.T, alloc *structs.Allocation) (*TaskRunner, *Config, func()) { 198 t.Parallel() 199 200 task := alloc.Job.TaskGroups[0].Tasks[0] 201 task.Driver = "raw_exec" 202 task.Config = map[string]interface{}{ 203 "command": "sleep", 204 "args": []string{"30"}, 205 } 206 conf, cleanup1 := testTaskRunnerConfig(t, alloc, task.Name) 207 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs 208 209 // Run the first TaskRunner 210 origTR, err := NewTaskRunner(conf) 211 require.NoError(t, err) 212 go origTR.Run() 213 cleanup2 := func() { 214 origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 215 cleanup1() 216 } 217 218 // Wait for it to be running 219 testWaitForTaskToStart(t, origTR) 220 221 handle := origTR.getDriverHandle() 222 require.NotNil(t, handle) 223 taskID := handle.taskID 224 225 // Cause TR to exit without shutting down task 226 origTR.Shutdown() 227 228 // Get the driver 229 driverPlugin, err := conf.DriverManager.Dispense(rawexec.PluginID.Name) 230 require.NoError(t, err) 231 rawexecDriver := driverPlugin.(*rawexec.Driver) 232 233 // Assert the task is still running despite TR having exited 234 taskStatus, err := rawexecDriver.InspectTask(taskID) 235 require.NoError(t, err) 236 require.Equal(t, drivers.TaskStateRunning, taskStatus.State) 237 238 // Kill the task so it fails to recover when restore is called 239 require.NoError(t, rawexecDriver.DestroyTask(taskID, true)) 240 _, err = rawexecDriver.InspectTask(taskID) 241 require.EqualError(t, err, drivers.ErrTaskNotFound.Error()) 242 243 // Create a new TaskRunner and Restore the task 244 conf.ServersContactedCh = make(chan struct{}) 245 newTR, err := NewTaskRunner(conf) 246 require.NoError(t, err) 247 248 // Assert the TR will wait on servers because reattachment failed 249 require.NoError(t, newTR.Restore()) 250 require.True(t, newTR.waitOnServers) 251 252 // Start new TR 253 go newTR.Run() 254 cleanup3 := func() { 255 newTR.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 256 cleanup2() 257 cleanup1() 258 } 259 260 // Assert task has not been restarted 261 _, err = rawexecDriver.InspectTask(taskID) 262 require.EqualError(t, err, drivers.ErrTaskNotFound.Error()) 263 ts := newTR.TaskState() 264 require.Equal(t, structs.TaskStatePending, ts.State) 265 266 return newTR, conf, cleanup3 267 } 268 269 // TestTaskRunner_Restore_Restart asserts restoring a dead task blocks until 270 // MarkAlive is called. #1795 271 func TestTaskRunner_Restore_Restart(t *testing.T) { 272 newTR, conf, cleanup := setupRestoreFailureTest(t, mock.Alloc()) 273 defer cleanup() 274 275 // Fake contacting the server by closing the chan 276 close(conf.ServersContactedCh) 277 278 testutil.WaitForResult(func() (bool, error) { 279 ts := newTR.TaskState().State 280 return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts) 281 }, func(err error) { 282 require.NoError(t, err) 283 }) 284 } 285 286 // TestTaskRunner_Restore_Kill asserts restoring a dead task blocks until 287 // the task is killed. 
#1795 288 func TestTaskRunner_Restore_Kill(t *testing.T) { 289 newTR, _, cleanup := setupRestoreFailureTest(t, mock.Alloc()) 290 defer cleanup() 291 292 // Sending the task a terminal update shouldn't kill it or unblock it 293 alloc := newTR.Alloc().Copy() 294 alloc.DesiredStatus = structs.AllocDesiredStatusStop 295 newTR.Update(alloc) 296 297 require.Equal(t, structs.TaskStatePending, newTR.TaskState().State) 298 299 // AllocRunner will immediately kill tasks after sending a terminal 300 // update. 301 newTR.Kill(context.Background(), structs.NewTaskEvent(structs.TaskKilling)) 302 303 select { 304 case <-newTR.WaitCh(): 305 // It died as expected! 306 case <-time.After(10 * time.Second): 307 require.Fail(t, "timeout waiting for task to die") 308 } 309 } 310 311 // TestTaskRunner_Restore_Update asserts restoring a dead task blocks until 312 // Update is called. #1795 313 func TestTaskRunner_Restore_Update(t *testing.T) { 314 newTR, conf, cleanup := setupRestoreFailureTest(t, mock.Alloc()) 315 defer cleanup() 316 317 // Fake Client.runAllocs behavior by calling Update then closing chan 318 alloc := newTR.Alloc().Copy() 319 newTR.Update(alloc) 320 321 // Update alone should not unblock the test 322 require.Equal(t, structs.TaskStatePending, newTR.TaskState().State) 323 324 // Fake Client.runAllocs behavior of closing chan after Update 325 close(conf.ServersContactedCh) 326 327 testutil.WaitForResult(func() (bool, error) { 328 ts := newTR.TaskState().State 329 return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts) 330 }, func(err error) { 331 require.NoError(t, err) 332 }) 333 } 334 335 // TestTaskRunner_Restore_System asserts restoring a dead system task does not 336 // block. 337 func TestTaskRunner_Restore_System(t *testing.T) { 338 t.Parallel() 339 340 alloc := mock.Alloc() 341 alloc.Job.Type = structs.JobTypeSystem 342 task := alloc.Job.TaskGroups[0].Tasks[0] 343 task.Driver = "raw_exec" 344 task.Config = map[string]interface{}{ 345 "command": "sleep", 346 "args": []string{"30"}, 347 } 348 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 349 defer cleanup() 350 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs 351 352 // Run the first TaskRunner 353 origTR, err := NewTaskRunner(conf) 354 require.NoError(t, err) 355 go origTR.Run() 356 defer origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 357 358 // Wait for it to be running 359 testWaitForTaskToStart(t, origTR) 360 361 handle := origTR.getDriverHandle() 362 require.NotNil(t, handle) 363 taskID := handle.taskID 364 365 // Cause TR to exit without shutting down task 366 origTR.Shutdown() 367 368 // Get the driver 369 driverPlugin, err := conf.DriverManager.Dispense(rawexec.PluginID.Name) 370 require.NoError(t, err) 371 rawexecDriver := driverPlugin.(*rawexec.Driver) 372 373 // Assert the task is still running despite TR having exited 374 taskStatus, err := rawexecDriver.InspectTask(taskID) 375 require.NoError(t, err) 376 require.Equal(t, drivers.TaskStateRunning, taskStatus.State) 377 378 // Kill the task so it fails to recover when restore is called 379 require.NoError(t, rawexecDriver.DestroyTask(taskID, true)) 380 _, err = rawexecDriver.InspectTask(taskID) 381 require.EqualError(t, err, drivers.ErrTaskNotFound.Error()) 382 383 // Create a new TaskRunner and Restore the task 384 conf.ServersContactedCh = make(chan struct{}) 385 newTR, err := NewTaskRunner(conf) 386 require.NoError(t, err) 387 388 // Assert the TR will not wait on servers 
even though reattachment 389 // failed because it is a system task. 390 require.NoError(t, newTR.Restore()) 391 require.False(t, newTR.waitOnServers) 392 393 // Nothing should have closed the chan 394 select { 395 case <-conf.ServersContactedCh: 396 require.Fail(t, "serversContactedCh was closed but should not have been") 397 default: 398 } 399 400 testutil.WaitForResult(func() (bool, error) { 401 ts := newTR.TaskState().State 402 return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts) 403 }, func(err error) { 404 require.NoError(t, err) 405 }) 406 } 407 408 // TestTaskRunner_TaskEnv_Interpolated asserts driver configurations are 409 // interpolated. 410 func TestTaskRunner_TaskEnv_Interpolated(t *testing.T) { 411 t.Parallel() 412 require := require.New(t) 413 414 alloc := mock.BatchAlloc() 415 alloc.Job.TaskGroups[0].Meta = map[string]string{ 416 "common_user": "somebody", 417 } 418 task := alloc.Job.TaskGroups[0].Tasks[0] 419 task.Meta = map[string]string{ 420 "foo": "bar", 421 } 422 423 // Use interpolation from both node attributes and meta vars 424 task.Config = map[string]interface{}{ 425 "run_for": "1ms", 426 "stdout_string": `${node.region} ${NOMAD_META_foo} ${NOMAD_META_common_user}`, 427 } 428 429 tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name) 430 defer cleanup() 431 432 // Wait for task to complete 433 select { 434 case <-tr.WaitCh(): 435 case <-time.After(3 * time.Second): 436 require.Fail("timeout waiting for task to exit") 437 } 438 439 // Get the mock driver plugin 440 driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name) 441 require.NoError(err) 442 mockDriver := driverPlugin.(*mockdriver.Driver) 443 444 // Assert its config has been properly interpolated 445 driverCfg, mockCfg := mockDriver.GetTaskConfig() 446 require.NotNil(driverCfg) 447 require.NotNil(mockCfg) 448 assert.Equal(t, "global bar somebody", mockCfg.StdoutString) 449 } 450 451 // TestTaskRunner_TaskEnv_Chroot asserts chroot drivers use chroot paths and 452 // not host paths. 
453 func TestTaskRunner_TaskEnv_Chroot(t *testing.T) { 454 ctestutil.ExecCompatible(t) 455 t.Parallel() 456 require := require.New(t) 457 458 alloc := mock.BatchAlloc() 459 task := alloc.Job.TaskGroups[0].Tasks[0] 460 task.Driver = "exec" 461 task.Config = map[string]interface{}{ 462 "command": "bash", 463 "args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " + 464 "echo $NOMAD_TASK_DIR; " + 465 "echo $NOMAD_SECRETS_DIR; " + 466 "echo $PATH; ", 467 }, 468 } 469 470 // Expect chroot paths and host $PATH 471 exp := fmt.Sprintf(`/alloc 472 /local 473 /secrets 474 %s 475 `, os.Getenv("PATH")) 476 477 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 478 defer cleanup() 479 480 // Remove /sbin and /usr from chroot 481 conf.ClientConfig.ChrootEnv = map[string]string{ 482 "/bin": "/bin", 483 "/etc": "/etc", 484 "/lib": "/lib", 485 "/lib32": "/lib32", 486 "/lib64": "/lib64", 487 "/run/resolvconf": "/run/resolvconf", 488 } 489 490 tr, err := NewTaskRunner(conf) 491 require.NoError(err) 492 go tr.Run() 493 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 494 495 // Wait for task to exit 496 select { 497 case <-tr.WaitCh(): 498 case <-time.After(15 * time.Second): 499 require.Fail("timeout waiting for task to exit") 500 } 501 502 // Read stdout 503 p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0") 504 stdout, err := ioutil.ReadFile(p) 505 require.NoError(err) 506 require.Equalf(exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout) 507 } 508 509 // TestTaskRunner_TaskEnv_Image asserts image drivers use chroot paths and 510 // not host paths. Host env vars should also be excluded. 511 func TestTaskRunner_TaskEnv_Image(t *testing.T) { 512 ctestutil.DockerCompatible(t) 513 t.Parallel() 514 require := require.New(t) 515 516 alloc := mock.BatchAlloc() 517 task := alloc.Job.TaskGroups[0].Tasks[0] 518 task.Driver = "docker" 519 task.Config = map[string]interface{}{ 520 "image": "redis:3.2-alpine", 521 "network_mode": "none", 522 "command": "sh", 523 "args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " + 524 "echo $NOMAD_TASK_DIR; " + 525 "echo $NOMAD_SECRETS_DIR; " + 526 "echo $PATH", 527 }, 528 } 529 530 // Expect chroot paths and image specific PATH 531 exp := `/alloc 532 /local 533 /secrets 534 /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 535 ` 536 537 tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name) 538 defer cleanup() 539 540 // Wait for task to exit 541 select { 542 case <-tr.WaitCh(): 543 case <-time.After(15 * time.Second): 544 require.Fail("timeout waiting for task to exit") 545 } 546 547 // Read stdout 548 p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0") 549 stdout, err := ioutil.ReadFile(p) 550 require.NoError(err) 551 require.Equalf(exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout) 552 } 553 554 // TestTaskRunner_TaskEnv_None asserts raw_exec uses host paths and env vars. 
555 func TestTaskRunner_TaskEnv_None(t *testing.T) { 556 t.Parallel() 557 require := require.New(t) 558 559 alloc := mock.BatchAlloc() 560 task := alloc.Job.TaskGroups[0].Tasks[0] 561 task.Driver = "raw_exec" 562 task.Config = map[string]interface{}{ 563 "command": "sh", 564 "args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " + 565 "echo $NOMAD_TASK_DIR; " + 566 "echo $NOMAD_SECRETS_DIR; " + 567 "echo $PATH", 568 }, 569 } 570 571 tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name) 572 defer cleanup() 573 574 // Expect host paths 575 root := filepath.Join(conf.ClientConfig.AllocDir, alloc.ID) 576 taskDir := filepath.Join(root, task.Name) 577 exp := fmt.Sprintf(`%s/alloc 578 %s/local 579 %s/secrets 580 %s 581 `, root, taskDir, taskDir, os.Getenv("PATH")) 582 583 // Wait for task to exit 584 select { 585 case <-tr.WaitCh(): 586 case <-time.After(15 * time.Second): 587 require.Fail("timeout waiting for task to exit") 588 } 589 590 // Read stdout 591 p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0") 592 stdout, err := ioutil.ReadFile(p) 593 require.NoError(err) 594 require.Equalf(exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout) 595 } 596 597 // Test that devices get sent to the driver 598 func TestTaskRunner_DevicePropogation(t *testing.T) { 599 t.Parallel() 600 require := require.New(t) 601 602 // Create a mock alloc that has a gpu 603 alloc := mock.BatchAlloc() 604 alloc.Job.TaskGroups[0].Count = 1 605 task := alloc.Job.TaskGroups[0].Tasks[0] 606 task.Driver = "mock_driver" 607 task.Config = map[string]interface{}{ 608 "run_for": "100ms", 609 } 610 tRes := alloc.AllocatedResources.Tasks[task.Name] 611 tRes.Devices = append(tRes.Devices, &structs.AllocatedDeviceResource{Type: "mock"}) 612 613 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 614 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between task runners 615 defer cleanup() 616 617 // Setup the devicemanager 618 dm, ok := conf.DeviceManager.(*devicemanager.MockManager) 619 require.True(ok) 620 621 dm.ReserveF = func(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) { 622 res := &device.ContainerReservation{ 623 Envs: map[string]string{ 624 "ABC": "123", 625 }, 626 Mounts: []*device.Mount{ 627 { 628 ReadOnly: true, 629 TaskPath: "foo", 630 HostPath: "bar", 631 }, 632 }, 633 Devices: []*device.DeviceSpec{ 634 { 635 TaskPath: "foo", 636 HostPath: "bar", 637 CgroupPerms: "123", 638 }, 639 }, 640 } 641 return res, nil 642 } 643 644 // Run the TaskRunner 645 tr, err := NewTaskRunner(conf) 646 require.NoError(err) 647 go tr.Run() 648 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 649 650 // Wait for task to complete 651 select { 652 case <-tr.WaitCh(): 653 case <-time.After(3 * time.Second): 654 } 655 656 // Get the mock driver plugin 657 driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name) 658 require.NoError(err) 659 mockDriver := driverPlugin.(*mockdriver.Driver) 660 661 // Assert its config has been properly interpolated 662 driverCfg, _ := mockDriver.GetTaskConfig() 663 require.NotNil(driverCfg) 664 require.Len(driverCfg.Devices, 1) 665 require.Equal(driverCfg.Devices[0].Permissions, "123") 666 require.Len(driverCfg.Mounts, 1) 667 require.Equal(driverCfg.Mounts[0].TaskPath, "foo") 668 require.Contains(driverCfg.Env, "ABC") 669 } 670 671 // mockEnvHook is a test hook that sets an env var and done=true. It fails if 672 // it's called more than once. 
673 type mockEnvHook struct { 674 called int 675 } 676 677 func (*mockEnvHook) Name() string { 678 return "mock_env_hook" 679 } 680 681 func (h *mockEnvHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error { 682 h.called++ 683 684 resp.Done = true 685 resp.Env = map[string]string{ 686 "mock_hook": "1", 687 } 688 689 return nil 690 } 691 692 // TestTaskRunner_Restore_HookEnv asserts that re-running prestart hooks with 693 // hook environments set restores the environment without re-running done 694 // hooks. 695 func TestTaskRunner_Restore_HookEnv(t *testing.T) { 696 t.Parallel() 697 require := require.New(t) 698 699 alloc := mock.BatchAlloc() 700 task := alloc.Job.TaskGroups[0].Tasks[0] 701 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 702 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between prestart calls 703 defer cleanup() 704 705 tr, err := NewTaskRunner(conf) 706 require.NoError(err) 707 708 // Override the default hooks to only run the mock hook 709 mockHook := &mockEnvHook{} 710 tr.runnerHooks = []interfaces.TaskHook{mockHook} 711 712 // Manually run prestart hooks 713 require.NoError(tr.prestart()) 714 715 // Assert env was called 716 require.Equal(1, mockHook.called) 717 718 // Re-running prestart hooks should *not* call done mock hook 719 require.NoError(tr.prestart()) 720 721 // Assert env was called 722 require.Equal(1, mockHook.called) 723 724 // Assert the env is still set 725 env := tr.envBuilder.Build().All() 726 require.Contains(env, "mock_hook") 727 require.Equal("1", env["mock_hook"]) 728 } 729 730 // This test asserts that we can recover from an "external" plugin exiting by 731 // retrieving a new instance of the driver and recovering the task. 732 func TestTaskRunner_RecoverFromDriverExiting(t *testing.T) { 733 t.Parallel() 734 require := require.New(t) 735 736 // Create an allocation using the mock driver that exits simulating the 737 // driver crashing. 
We can then test that the task runner recovers from this 738 alloc := mock.BatchAlloc() 739 task := alloc.Job.TaskGroups[0].Tasks[0] 740 task.Driver = "mock_driver" 741 task.Config = map[string]interface{}{ 742 "plugin_exit_after": "1s", 743 "run_for": "5s", 744 } 745 746 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 747 conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between prestart calls 748 defer cleanup() 749 750 tr, err := NewTaskRunner(conf) 751 require.NoError(err) 752 753 start := time.Now() 754 go tr.Run() 755 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 756 757 // Wait for the task to be running 758 testWaitForTaskToStart(t, tr) 759 760 // Get the task ID 761 tr.stateLock.RLock() 762 l := tr.localState.TaskHandle 763 require.NotNil(l) 764 require.NotNil(l.Config) 765 require.NotEmpty(l.Config.ID) 766 id := l.Config.ID 767 tr.stateLock.RUnlock() 768 769 // Get the mock driver plugin 770 driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name) 771 require.NoError(err) 772 mockDriver := driverPlugin.(*mockdriver.Driver) 773 774 // Wait for the task to start 775 testutil.WaitForResult(func() (bool, error) { 776 // Get the handle and check that it was recovered 777 handle := mockDriver.GetHandle(id) 778 if handle == nil { 779 return false, fmt.Errorf("nil handle") 780 } 781 if !handle.Recovered { 782 return false, fmt.Errorf("handle not recovered") 783 } 784 return true, nil 785 }, func(err error) { 786 t.Fatal(err.Error()) 787 }) 788 789 // Wait for task to complete 790 select { 791 case <-tr.WaitCh(): 792 case <-time.After(10 * time.Second): 793 } 794 795 // Ensure that we actually let the task complete 796 require.True(time.Now().Sub(start) > 5*time.Second) 797 798 // Check it finished successfully 799 state := tr.TaskState() 800 require.True(state.Successful()) 801 } 802 803 // TestTaskRunner_ShutdownDelay asserts services are removed from Consul 804 // ${shutdown_delay} seconds before killing the process. 805 func TestTaskRunner_ShutdownDelay(t *testing.T) { 806 t.Parallel() 807 808 alloc := mock.Alloc() 809 task := alloc.Job.TaskGroups[0].Tasks[0] 810 task.Services[0].Tags = []string{"tag1"} 811 task.Services = task.Services[:1] // only need 1 for this test 812 task.Driver = "mock_driver" 813 task.Config = map[string]interface{}{ 814 "run_for": "1000s", 815 } 816 817 // No shutdown escape hatch for this delay, so don't set it too high 818 task.ShutdownDelay = 1000 * time.Duration(testutil.TestMultiplier()) * time.Millisecond 819 820 tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name) 821 defer cleanup() 822 823 mockConsul := conf.Consul.(*consul.MockConsulServiceClient) 824 825 // Wait for the task to start 826 testWaitForTaskToStart(t, tr) 827 828 testutil.WaitForResult(func() (bool, error) { 829 ops := mockConsul.GetOps() 830 if n := len(ops); n != 1 { 831 return false, fmt.Errorf("expected 1 consul operation. 
Found %d", n) 832 } 833 return ops[0].Op == "add", fmt.Errorf("consul operation was not a registration: %#v", ops[0]) 834 }, func(err error) { 835 t.Fatalf("err: %v", err) 836 }) 837 838 // Asynchronously kill task 839 killSent := time.Now() 840 killed := make(chan struct{}) 841 go func() { 842 defer close(killed) 843 assert.NoError(t, tr.Kill(context.Background(), structs.NewTaskEvent("test"))) 844 }() 845 846 // Wait for *2* deregistration calls (due to needing to remove both 847 // canary tag variants) 848 WAIT: 849 for { 850 ops := mockConsul.GetOps() 851 switch n := len(ops); n { 852 case 1, 2: 853 // Waiting for both deregistration calls 854 case 3: 855 require.Equalf(t, "remove", ops[1].Op, "expected deregistration but found: %#v", ops[1]) 856 require.Equalf(t, "remove", ops[2].Op, "expected deregistration but found: %#v", ops[2]) 857 break WAIT 858 default: 859 // ?! 860 t.Fatalf("unexpected number of consul operations: %d\n%s", n, pretty.Sprint(ops)) 861 862 } 863 864 select { 865 case <-killed: 866 t.Fatal("killed while service still registered") 867 case <-time.After(10 * time.Millisecond): 868 } 869 } 870 871 // Wait for actual exit 872 select { 873 case <-tr.WaitCh(): 874 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 875 t.Fatalf("timeout") 876 } 877 878 <-killed 879 killDur := time.Now().Sub(killSent) 880 if killDur < task.ShutdownDelay { 881 t.Fatalf("task killed before shutdown_delay (killed_after: %s; shutdown_delay: %s", 882 killDur, task.ShutdownDelay, 883 ) 884 } 885 } 886 887 // TestTaskRunner_Dispatch_Payload asserts that a dispatch job runs and the 888 // payload was written to disk. 889 func TestTaskRunner_Dispatch_Payload(t *testing.T) { 890 t.Parallel() 891 892 alloc := mock.BatchAlloc() 893 task := alloc.Job.TaskGroups[0].Tasks[0] 894 task.Driver = "mock_driver" 895 task.Config = map[string]interface{}{ 896 "run_for": "1s", 897 } 898 899 fileName := "test" 900 task.DispatchPayload = &structs.DispatchPayloadConfig{ 901 File: fileName, 902 } 903 alloc.Job.ParameterizedJob = &structs.ParameterizedJobConfig{} 904 905 // Add a payload (they're snappy encoded bytes) 906 expected := []byte("hello world") 907 compressed := snappy.Encode(nil, expected) 908 alloc.Job.Payload = compressed 909 910 tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name) 911 defer cleanup() 912 913 // Wait for it to finish 914 testutil.WaitForResult(func() (bool, error) { 915 ts := tr.TaskState() 916 return ts.State == structs.TaskStateDead, fmt.Errorf("%v", ts.State) 917 }, func(err error) { 918 require.NoError(t, err) 919 }) 920 921 // Should have exited successfully 922 ts := tr.TaskState() 923 require.False(t, ts.Failed) 924 require.Zero(t, ts.Restarts) 925 926 // Check that the file was written to disk properly 927 payloadPath := filepath.Join(tr.taskDir.LocalDir, fileName) 928 data, err := ioutil.ReadFile(payloadPath) 929 require.NoError(t, err) 930 require.Equal(t, expected, data) 931 } 932 933 // TestTaskRunner_SignalFailure asserts that signal errors are properly 934 // propagated from the driver to TaskRunner. 
935 func TestTaskRunner_SignalFailure(t *testing.T) { 936 t.Parallel() 937 938 alloc := mock.Alloc() 939 task := alloc.Job.TaskGroups[0].Tasks[0] 940 task.Driver = "mock_driver" 941 errMsg := "test forcing failure" 942 task.Config = map[string]interface{}{ 943 "run_for": "10m", 944 "signal_error": errMsg, 945 } 946 947 tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name) 948 defer cleanup() 949 950 testWaitForTaskToStart(t, tr) 951 952 require.EqualError(t, tr.Signal(&structs.TaskEvent{}, "SIGINT"), errMsg) 953 } 954 955 // TestTaskRunner_RestartTask asserts that restarting a task works and emits a 956 // Restarting event. 957 func TestTaskRunner_RestartTask(t *testing.T) { 958 t.Parallel() 959 960 alloc := mock.Alloc() 961 task := alloc.Job.TaskGroups[0].Tasks[0] 962 task.Driver = "mock_driver" 963 task.Config = map[string]interface{}{ 964 "run_for": "10m", 965 } 966 967 tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name) 968 defer cleanup() 969 970 testWaitForTaskToStart(t, tr) 971 972 // Restart task. Send a RestartSignal event like check watcher. Restart 973 // handler emits the Restarting event. 974 event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason("test") 975 const fail = false 976 tr.Restart(context.Background(), event.Copy(), fail) 977 978 // Wait for it to restart and be running again 979 testutil.WaitForResult(func() (bool, error) { 980 ts := tr.TaskState() 981 if ts.Restarts != 1 { 982 return false, fmt.Errorf("expected 1 restart but found %d\nevents: %s", 983 ts.Restarts, pretty.Sprint(ts.Events)) 984 } 985 if ts.State != structs.TaskStateRunning { 986 return false, fmt.Errorf("expected running but received %s", ts.State) 987 } 988 return true, nil 989 }, func(err error) { 990 require.NoError(t, err) 991 }) 992 993 // Assert the expected Restarting event was emitted 994 found := false 995 events := tr.TaskState().Events 996 for _, e := range events { 997 if e.Type == structs.TaskRestartSignal { 998 found = true 999 require.Equal(t, event.Time, e.Time) 1000 require.Equal(t, event.RestartReason, e.RestartReason) 1001 require.Contains(t, e.DisplayMessage, event.RestartReason) 1002 } 1003 } 1004 require.True(t, found, "restarting task event not found", pretty.Sprint(events)) 1005 } 1006 1007 // TestTaskRunner_CheckWatcher_Restart asserts that when enabled an unhealthy 1008 // Consul check will cause a task to restart following restart policy rules. 1009 func TestTaskRunner_CheckWatcher_Restart(t *testing.T) { 1010 t.Parallel() 1011 1012 alloc := mock.Alloc() 1013 1014 // Make the restart policy fail within this test 1015 tg := alloc.Job.TaskGroups[0] 1016 tg.RestartPolicy.Attempts = 2 1017 tg.RestartPolicy.Interval = 1 * time.Minute 1018 tg.RestartPolicy.Delay = 10 * time.Millisecond 1019 tg.RestartPolicy.Mode = structs.RestartPolicyModeFail 1020 1021 task := tg.Tasks[0] 1022 task.Driver = "mock_driver" 1023 task.Config = map[string]interface{}{ 1024 "run_for": "10m", 1025 } 1026 1027 // Make the task register a check that fails 1028 task.Services[0].Checks[0] = &structs.ServiceCheck{ 1029 Name: "test-restarts", 1030 Type: structs.ServiceCheckTCP, 1031 Interval: 50 * time.Millisecond, 1032 CheckRestart: &structs.CheckRestart{ 1033 Limit: 2, 1034 Grace: 100 * time.Millisecond, 1035 }, 1036 } 1037 1038 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 1039 defer cleanup() 1040 1041 // Replace mock Consul ServiceClient, with the real ServiceClient 1042 // backed by a mock consul whose checks are always unhealthy. 
1043 consulAgent := agentconsul.NewMockAgent() 1044 consulAgent.SetStatus("critical") 1045 consulClient := agentconsul.NewServiceClient(consulAgent, conf.Logger, true) 1046 go consulClient.Run() 1047 defer consulClient.Shutdown() 1048 1049 conf.Consul = consulClient 1050 1051 tr, err := NewTaskRunner(conf) 1052 require.NoError(t, err) 1053 1054 expectedEvents := []string{ 1055 "Received", 1056 "Task Setup", 1057 "Started", 1058 "Restart Signaled", 1059 "Terminated", 1060 "Restarting", 1061 "Started", 1062 "Restart Signaled", 1063 "Terminated", 1064 "Restarting", 1065 "Started", 1066 "Restart Signaled", 1067 "Terminated", 1068 "Not Restarting", 1069 } 1070 1071 // Bump maxEvents so task events aren't dropped 1072 tr.maxEvents = 100 1073 1074 go tr.Run() 1075 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 1076 1077 // Wait until the task exits. Don't simply wait for it to run as it may 1078 // get restarted and terminated before the test is able to observe it 1079 // running. 1080 select { 1081 case <-tr.WaitCh(): 1082 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 1083 require.Fail(t, "timeout") 1084 } 1085 1086 state := tr.TaskState() 1087 actualEvents := make([]string, len(state.Events)) 1088 for i, e := range state.Events { 1089 actualEvents[i] = string(e.Type) 1090 } 1091 require.Equal(t, actualEvents, expectedEvents) 1092 require.Equal(t, structs.TaskStateDead, state.State) 1093 require.True(t, state.Failed, pretty.Sprint(state)) 1094 } 1095 1096 type mockEnvoyBootstrapHook struct { 1097 // nothing 1098 } 1099 1100 func (_ *mockEnvoyBootstrapHook) Name() string { 1101 return "mock_envoy_bootstrap" 1102 } 1103 1104 func (_ *mockEnvoyBootstrapHook) Prestart(_ context.Context, _ *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error { 1105 resp.Done = true 1106 return nil 1107 } 1108 1109 // The envoy bootstrap hook tries to connect to consul and run the envoy 1110 // bootstrap command, so turn it off when testing connect jobs that are not 1111 // using envoy. 1112 func useMockEnvoyBootstrapHook(tr *TaskRunner) { 1113 mock := new(mockEnvoyBootstrapHook) 1114 for i, hook := range tr.runnerHooks { 1115 if _, ok := hook.(*envoyBootstrapHook); ok { 1116 tr.runnerHooks[i] = mock 1117 } 1118 } 1119 } 1120 1121 // TestTaskRunner_BlockForSIDSToken asserts tasks do not start until a Consul 1122 // Service Identity token is derived. 
func TestTaskRunner_BlockForSIDSToken(t *testing.T) {
	t.Parallel()
	r := require.New(t)

	alloc := mock.BatchConnectAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "0s",
	}

	trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	// set a consul token on the Nomad client's consul config, because that is
	// what gates the action of requesting SI token(s)
	trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate()

	// control when we get a Consul SI token
	token := uuid.Generate()
	waitCh := make(chan struct{})
	deriveFn := func(*structs.Allocation, []string) (map[string]string, error) {
		<-waitCh
		return map[string]string{task.Name: token}, nil
	}
	siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient)
	siClient.DeriveTokenFn = deriveFn

	// start the task runner
	tr, err := NewTaskRunner(trConfig)
	r.NoError(err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap hook

	go tr.Run()

	// assert task runner blocks on SI token
	select {
	case <-tr.WaitCh():
		r.Fail("task_runner exited before si unblocked")
	case <-time.After(100 * time.Millisecond):
	}

	// assert task state is still pending
	r.Equal(structs.TaskStatePending, tr.TaskState().State)

	// unblock service identity token
	close(waitCh)

	// task runner should exit now that it has been unblocked and it is a batch
	// job with a zero sleep time
	select {
	case <-tr.WaitCh():
	case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())):
		r.Fail("timed out waiting for batch task to exit")
	}

	// assert task exited successfully
	finalState := tr.TaskState()
	r.Equal(structs.TaskStateDead, finalState.State)
	r.False(finalState.Failed)

	// assert the token is on disk
	tokenPath := filepath.Join(trConfig.TaskDir.SecretsDir, sidsTokenFile)
	data, err := ioutil.ReadFile(tokenPath)
	r.NoError(err)
	r.Equal(token, string(data))
}

// TestTaskRunner_DeriveSIToken_Retry asserts that a recoverable error while
// deriving a Consul Service Identity token is retried and does not fail the
// task.
func TestTaskRunner_DeriveSIToken_Retry(t *testing.T) {
	t.Parallel()
	r := require.New(t)

	alloc := mock.BatchConnectAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "0s",
	}

	trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	// set a consul token on the Nomad client's consul config, because that is
	// what gates the action of requesting SI token(s)
	trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate()

	// control when we get a Consul SI token (recoverable failure on first call)
	token := uuid.Generate()
	deriveCount := 0
	deriveFn := func(*structs.Allocation, []string) (map[string]string, error) {
		if deriveCount > 0 {
			return map[string]string{task.Name: token}, nil
		}
		deriveCount++
		return nil, structs.NewRecoverableError(errors.New("try again later"), true)
	}
	siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient)
	siClient.DeriveTokenFn = deriveFn

	// start the task runner
	tr, err := NewTaskRunner(trConfig)
	r.NoError(err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
1226 useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap 1227 go tr.Run() 1228 1229 // assert task runner blocks on SI token 1230 select { 1231 case <-tr.WaitCh(): 1232 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 1233 r.Fail("timed out waiting for task runner") 1234 } 1235 1236 // assert task exited successfully 1237 finalState := tr.TaskState() 1238 r.Equal(structs.TaskStateDead, finalState.State) 1239 r.False(finalState.Failed) 1240 1241 // assert the token is on disk 1242 tokenPath := filepath.Join(trConfig.TaskDir.SecretsDir, sidsTokenFile) 1243 data, err := ioutil.ReadFile(tokenPath) 1244 r.NoError(err) 1245 r.Equal(token, string(data)) 1246 } 1247 1248 // TestTaskRunner_DeriveSIToken_Unrecoverable asserts that an unrecoverable error 1249 // from deriving a service identity token will fail a task. 1250 func TestTaskRunner_DeriveSIToken_Unrecoverable(t *testing.T) { 1251 t.Parallel() 1252 r := require.New(t) 1253 1254 alloc := mock.BatchConnectAlloc() 1255 tg := alloc.Job.TaskGroups[0] 1256 tg.RestartPolicy.Attempts = 0 1257 tg.RestartPolicy.Interval = 0 1258 tg.RestartPolicy.Delay = 0 1259 tg.RestartPolicy.Mode = structs.RestartPolicyModeFail 1260 task := tg.Tasks[0] 1261 task.Config = map[string]interface{}{ 1262 "run_for": "0s", 1263 } 1264 1265 trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 1266 defer cleanup() 1267 1268 // set a consul token on the Nomad client's consul config, because that is 1269 // what gates the action of requesting SI token(s) 1270 trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate() 1271 1272 // SI token derivation suffers a non-retryable error 1273 siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient) 1274 siClient.SetDeriveTokenError(alloc.ID, []string{task.Name}, errors.New("non-recoverable")) 1275 1276 tr, err := NewTaskRunner(trConfig) 1277 r.NoError(err) 1278 1279 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 1280 useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap hook 1281 go tr.Run() 1282 1283 // Wait for the task to die 1284 select { 1285 case <-tr.WaitCh(): 1286 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 1287 require.Fail(t, "timed out waiting for task runner to fail") 1288 } 1289 1290 // assert we have died and failed 1291 finalState := tr.TaskState() 1292 r.Equal(structs.TaskStateDead, finalState.State) 1293 r.True(finalState.Failed) 1294 r.Equal(5, len(finalState.Events)) 1295 /* 1296 + event: Task received by client 1297 + event: Building Task Directory 1298 + event: consul: failed to derive SI token: non-recoverable 1299 + event: consul_sids: context canceled 1300 + event: Policy allows no restarts 1301 */ 1302 r.Equal("true", finalState.Events[2].Details["fails_task"]) 1303 } 1304 1305 // TestTaskRunner_BlockForVaultToken asserts tasks do not start until a vault token 1306 // is derived. 
1307 func TestTaskRunner_BlockForVaultToken(t *testing.T) { 1308 t.Parallel() 1309 1310 alloc := mock.BatchAlloc() 1311 task := alloc.Job.TaskGroups[0].Tasks[0] 1312 task.Config = map[string]interface{}{ 1313 "run_for": "0s", 1314 } 1315 task.Vault = &structs.Vault{Policies: []string{"default"}} 1316 1317 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 1318 defer cleanup() 1319 1320 // Control when we get a Vault token 1321 token := "1234" 1322 waitCh := make(chan struct{}) 1323 handler := func(*structs.Allocation, []string) (map[string]string, error) { 1324 <-waitCh 1325 return map[string]string{task.Name: token}, nil 1326 } 1327 vaultClient := conf.Vault.(*vaultclient.MockVaultClient) 1328 vaultClient.DeriveTokenFn = handler 1329 1330 tr, err := NewTaskRunner(conf) 1331 require.NoError(t, err) 1332 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 1333 go tr.Run() 1334 1335 // Assert TR blocks on vault token (does *not* exit) 1336 select { 1337 case <-tr.WaitCh(): 1338 require.Fail(t, "tr exited before vault unblocked") 1339 case <-time.After(1 * time.Second): 1340 } 1341 1342 // Assert task state is still Pending 1343 require.Equal(t, structs.TaskStatePending, tr.TaskState().State) 1344 1345 // Unblock vault token 1346 close(waitCh) 1347 1348 // TR should exit now that it's unblocked by vault as its a batch job 1349 // with 0 sleeping. 1350 select { 1351 case <-tr.WaitCh(): 1352 case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())): 1353 require.Fail(t, "timed out waiting for batch task to exit") 1354 } 1355 1356 // Assert task exited successfully 1357 finalState := tr.TaskState() 1358 require.Equal(t, structs.TaskStateDead, finalState.State) 1359 require.False(t, finalState.Failed) 1360 1361 // Check that the token is on disk 1362 tokenPath := filepath.Join(conf.TaskDir.SecretsDir, vaultTokenFile) 1363 data, err := ioutil.ReadFile(tokenPath) 1364 require.NoError(t, err) 1365 require.Equal(t, token, string(data)) 1366 1367 // Check the token was revoked 1368 testutil.WaitForResult(func() (bool, error) { 1369 if len(vaultClient.StoppedTokens()) != 1 { 1370 return false, fmt.Errorf("Expected a stopped token %q but found: %v", token, vaultClient.StoppedTokens()) 1371 } 1372 1373 if a := vaultClient.StoppedTokens()[0]; a != token { 1374 return false, fmt.Errorf("got stopped token %q; want %q", a, token) 1375 } 1376 return true, nil 1377 }, func(err error) { 1378 require.Fail(t, err.Error()) 1379 }) 1380 } 1381 1382 // TestTaskRunner_DeriveToken_Retry asserts that if a recoverable error is 1383 // returned when deriving a vault token a task will continue to block while 1384 // it's retried. 
1385 func TestTaskRunner_DeriveToken_Retry(t *testing.T) { 1386 t.Parallel() 1387 alloc := mock.BatchAlloc() 1388 task := alloc.Job.TaskGroups[0].Tasks[0] 1389 task.Vault = &structs.Vault{Policies: []string{"default"}} 1390 1391 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 1392 defer cleanup() 1393 1394 // Fail on the first attempt to derive a vault token 1395 token := "1234" 1396 count := 0 1397 handler := func(*structs.Allocation, []string) (map[string]string, error) { 1398 if count > 0 { 1399 return map[string]string{task.Name: token}, nil 1400 } 1401 1402 count++ 1403 return nil, structs.NewRecoverableError(fmt.Errorf("Want a retry"), true) 1404 } 1405 vaultClient := conf.Vault.(*vaultclient.MockVaultClient) 1406 vaultClient.DeriveTokenFn = handler 1407 1408 tr, err := NewTaskRunner(conf) 1409 require.NoError(t, err) 1410 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 1411 go tr.Run() 1412 1413 // Wait for TR to exit and check its state 1414 select { 1415 case <-tr.WaitCh(): 1416 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 1417 require.Fail(t, "timed out waiting for task runner to exit") 1418 } 1419 1420 state := tr.TaskState() 1421 require.Equal(t, structs.TaskStateDead, state.State) 1422 require.False(t, state.Failed) 1423 1424 require.Equal(t, 1, count) 1425 1426 // Check that the token is on disk 1427 tokenPath := filepath.Join(conf.TaskDir.SecretsDir, vaultTokenFile) 1428 data, err := ioutil.ReadFile(tokenPath) 1429 require.NoError(t, err) 1430 require.Equal(t, token, string(data)) 1431 1432 // Check the token was revoked 1433 testutil.WaitForResult(func() (bool, error) { 1434 if len(vaultClient.StoppedTokens()) != 1 { 1435 return false, fmt.Errorf("Expected a stopped token: %v", vaultClient.StoppedTokens()) 1436 } 1437 1438 if a := vaultClient.StoppedTokens()[0]; a != token { 1439 return false, fmt.Errorf("got stopped token %q; want %q", a, token) 1440 } 1441 return true, nil 1442 }, func(err error) { 1443 require.Fail(t, err.Error()) 1444 }) 1445 } 1446 1447 // TestTaskRunner_DeriveToken_Unrecoverable asserts that an unrecoverable error 1448 // from deriving a vault token will fail a task. 
func TestTaskRunner_DeriveToken_Unrecoverable(t *testing.T) {
	t.Parallel()

	// Use a batch job with no restarts
	alloc := mock.BatchAlloc()
	tg := alloc.Job.TaskGroups[0]
	tg.RestartPolicy.Attempts = 0
	tg.RestartPolicy.Interval = 0
	tg.RestartPolicy.Delay = 0
	tg.RestartPolicy.Mode = structs.RestartPolicyModeFail
	task := tg.Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "0s",
	}
	task.Vault = &structs.Vault{Policies: []string{"default"}}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	// Error the token derivation
	vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
	vaultClient.SetDeriveTokenError(alloc.ID, []string{task.Name}, fmt.Errorf("Non recoverable"))

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	// Wait for the task to die
	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		require.Fail(t, "timed out waiting for task runner to fail")
	}

	// Task should be dead and last event should have failed task
	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.True(t, state.Failed)
	require.Len(t, state.Events, 3)
	require.True(t, state.Events[2].FailsTask)
}

// TestTaskRunner_Download_ChrootExec asserts that downloaded artifacts may be
// executed in a chroot.
func TestTaskRunner_Download_ChrootExec(t *testing.T) {
	t.Parallel()
	ctestutil.ExecCompatible(t)

	ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
	defer ts.Close()

	// Create a task that downloads a script and executes it.
	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.RestartPolicy = &structs.RestartPolicy{}
	task.Driver = "exec"
	task.Config = map[string]interface{}{
		"command": "noop.sh",
	}
	task.Artifacts = []*structs.TaskArtifact{
		{
			GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
			GetterMode:   "file",
			RelativeDest: "noop.sh",
		},
	}

	tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Wait for task to run and exit
	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		require.Fail(t, "timed out waiting for task runner to exit")
	}

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.False(t, state.Failed)
}

// TestTaskRunner_Download_RawExec asserts that downloaded artifacts may be
// executed in a driver without filesystem isolation.
func TestTaskRunner_Download_RawExec(t *testing.T) {
	t.Parallel()

	ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
	defer ts.Close()

	// Create a task that downloads a script and executes it.
	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.RestartPolicy = &structs.RestartPolicy{}
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "noop.sh",
	}
	task.Artifacts = []*structs.TaskArtifact{
		{
			GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
			GetterMode:   "file",
			RelativeDest: "noop.sh",
		},
	}

	tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Wait for task to run and exit
	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		require.Fail(t, "timed out waiting for task runner to exit")
	}

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.False(t, state.Failed)
}

// TestTaskRunner_Download_List asserts that multiple artifacts are downloaded
// before a task is run.
func TestTaskRunner_Download_List(t *testing.T) {
	t.Parallel()
	ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
	defer ts.Close()

	// Create an allocation that has a task with a list of artifacts.
	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	f1 := "task_runner_test.go"
	f2 := "task_runner.go"
	artifact1 := structs.TaskArtifact{
		GetterSource: fmt.Sprintf("%s/%s", ts.URL, f1),
	}
	artifact2 := structs.TaskArtifact{
		GetterSource: fmt.Sprintf("%s/%s", ts.URL, f2),
	}
	task.Artifacts = []*structs.TaskArtifact{&artifact1, &artifact2}

	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Wait for task to run and exit
	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		require.Fail(t, "timed out waiting for task runner to exit")
	}

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.False(t, state.Failed)

	require.Len(t, state.Events, 5)
	assert.Equal(t, structs.TaskReceived, state.Events[0].Type)
	assert.Equal(t, structs.TaskSetup, state.Events[1].Type)
	assert.Equal(t, structs.TaskDownloadingArtifacts, state.Events[2].Type)
	assert.Equal(t, structs.TaskStarted, state.Events[3].Type)
	assert.Equal(t, structs.TaskTerminated, state.Events[4].Type)

	// Check that both files exist.
	_, err := os.Stat(filepath.Join(conf.TaskDir.Dir, f1))
	require.NoErrorf(t, err, "%v not downloaded", f1)

	_, err = os.Stat(filepath.Join(conf.TaskDir.Dir, f2))
	require.NoErrorf(t, err, "%v not downloaded", f2)
}

// TestTaskRunner_Download_Retries asserts that failed artifact downloads are
// retried according to the task's restart policy.
func TestTaskRunner_Download_Retries(t *testing.T) {
	t.Parallel()

	// Create an allocation that has a task with bad artifacts.
1628 alloc := mock.BatchAlloc() 1629 task := alloc.Job.TaskGroups[0].Tasks[0] 1630 artifact := structs.TaskArtifact{ 1631 GetterSource: "http://127.0.0.1:0/foo/bar/baz", 1632 } 1633 task.Artifacts = []*structs.TaskArtifact{&artifact} 1634 1635 // Make the restart policy retry once 1636 rp := &structs.RestartPolicy{ 1637 Attempts: 1, 1638 Interval: 10 * time.Minute, 1639 Delay: 1 * time.Second, 1640 Mode: structs.RestartPolicyModeFail, 1641 } 1642 alloc.Job.TaskGroups[0].RestartPolicy = rp 1643 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp 1644 1645 tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name) 1646 defer cleanup() 1647 1648 select { 1649 case <-tr.WaitCh(): 1650 case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): 1651 require.Fail(t, "timed out waiting for task to exit") 1652 } 1653 1654 state := tr.TaskState() 1655 require.Equal(t, structs.TaskStateDead, state.State) 1656 require.True(t, state.Failed) 1657 require.Len(t, state.Events, 8, pretty.Sprint(state.Events)) 1658 require.Equal(t, structs.TaskReceived, state.Events[0].Type) 1659 require.Equal(t, structs.TaskSetup, state.Events[1].Type) 1660 require.Equal(t, structs.TaskDownloadingArtifacts, state.Events[2].Type) 1661 require.Equal(t, structs.TaskArtifactDownloadFailed, state.Events[3].Type) 1662 require.Equal(t, structs.TaskRestarting, state.Events[4].Type) 1663 require.Equal(t, structs.TaskDownloadingArtifacts, state.Events[5].Type) 1664 require.Equal(t, structs.TaskArtifactDownloadFailed, state.Events[6].Type) 1665 require.Equal(t, structs.TaskNotRestarting, state.Events[7].Type) 1666 } 1667 1668 // TestTaskRunner_DriverNetwork asserts that a driver's network is properly 1669 // used in services and checks. 1670 func TestTaskRunner_DriverNetwork(t *testing.T) { 1671 t.Parallel() 1672 1673 alloc := mock.Alloc() 1674 task := alloc.Job.TaskGroups[0].Tasks[0] 1675 task.Driver = "mock_driver" 1676 task.Config = map[string]interface{}{ 1677 "run_for": "100s", 1678 "driver_ip": "10.1.2.3", 1679 "driver_port_map": "http:80", 1680 } 1681 1682 // Create services and checks with custom address modes to exercise 1683 // address detection logic 1684 task.Services = []*structs.Service{ 1685 { 1686 Name: "host-service", 1687 PortLabel: "http", 1688 AddressMode: "host", 1689 Checks: []*structs.ServiceCheck{ 1690 { 1691 Name: "driver-check", 1692 Type: "tcp", 1693 PortLabel: "1234", 1694 AddressMode: "driver", 1695 }, 1696 }, 1697 }, 1698 { 1699 Name: "driver-service", 1700 PortLabel: "5678", 1701 AddressMode: "driver", 1702 Checks: []*structs.ServiceCheck{ 1703 { 1704 Name: "host-check", 1705 Type: "tcp", 1706 PortLabel: "http", 1707 }, 1708 { 1709 Name: "driver-label-check", 1710 Type: "tcp", 1711 PortLabel: "http", 1712 AddressMode: "driver", 1713 }, 1714 }, 1715 }, 1716 } 1717 1718 conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name) 1719 defer cleanup() 1720 1721 // Use a mock agent to test for services 1722 consulAgent := agentconsul.NewMockAgent() 1723 consulClient := agentconsul.NewServiceClient(consulAgent, conf.Logger, true) 1724 defer consulClient.Shutdown() 1725 go consulClient.Run() 1726 1727 conf.Consul = consulClient 1728 1729 tr, err := NewTaskRunner(conf) 1730 require.NoError(t, err) 1731 defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup")) 1732 go tr.Run() 1733 1734 // Wait for the task to start 1735 testWaitForTaskToStart(t, tr) 1736 1737 testutil.WaitForResult(func() (bool, error) { 1738 services, _ := consulAgent.Services() 1739 if n := len(services); n != 2 { 
1740 return false, fmt.Errorf("expected 2 services, but found %d", n) 1741 } 1742 for _, s := range services { 1743 switch s.Service { 1744 case "host-service": 1745 if expected := "192.168.0.100"; s.Address != expected { 1746 return false, fmt.Errorf("expected host-service to have IP=%s but found %s", 1747 expected, s.Address) 1748 } 1749 case "driver-service": 1750 if expected := "10.1.2.3"; s.Address != expected { 1751 return false, fmt.Errorf("expected driver-service to have IP=%s but found %s", 1752 expected, s.Address) 1753 } 1754 if expected := 5678; s.Port != expected { 1755 return false, fmt.Errorf("expected driver-service to have port=%d but found %d", 1756 expected, s.Port) 1757 } 1758 default: 1759 return false, fmt.Errorf("unexpected service: %q", s.Service) 1760 } 1761 1762 } 1763 1764 checks := consulAgent.CheckRegs() 1765 if n := len(checks); n != 3 { 1766 return false, fmt.Errorf("expected 3 checks, but found %d", n) 1767 } 1768 for _, check := range checks { 1769 switch check.Name { 1770 case "driver-check": 1771 if expected := "10.1.2.3:1234"; check.TCP != expected { 1772 return false, fmt.Errorf("expected driver-check to have address %q but found %q", expected, check.TCP) 1773 } 1774 case "driver-label-check": 1775 if expected := "10.1.2.3:80"; check.TCP != expected { 1776 return false, fmt.Errorf("expected driver-label-check to have address %q but found %q", expected, check.TCP) 1777 } 1778 case "host-check": 1779 if expected := "192.168.0.100:"; !strings.HasPrefix(check.TCP, expected) { 1780 return false, fmt.Errorf("expected host-check to have address start with %q but found %q", expected, check.TCP) 1781 } 1782 default: 1783 return false, fmt.Errorf("unexpected check: %q", check.Name) 1784 } 1785 } 1786 1787 return true, nil 1788 }, func(err error) { 1789 services, _ := consulAgent.Services() 1790 for _, s := range services { 1791 t.Logf(pretty.Sprint("Service: ", s)) 1792 } 1793 for _, c := range consulAgent.CheckRegs() { 1794 t.Logf(pretty.Sprint("Check: ", c)) 1795 } 1796 require.NoError(t, err) 1797 }) 1798 } 1799 1800 // TestTaskRunner_RestartSignalTask_NotRunning asserts resilience to failures 1801 // when a restart or signal is triggered and the task is not running. 

// TestTaskRunner_RestartSignalTask_NotRunning asserts resilience to failures
// when a restart or signal is triggered and the task is not running.
func TestTaskRunner_RestartSignalTask_NotRunning(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "0s",
	}

	// Use Vault to block the start
	task.Vault = &structs.Vault{Policies: []string{"default"}}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	// Control when we get a Vault token
	waitCh := make(chan struct{}, 1)
	defer close(waitCh)
	handler := func(*structs.Allocation, []string) (map[string]string, error) {
		<-waitCh
		return map[string]string{task.Name: "1234"}, nil
	}
	vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
	vaultClient.DeriveTokenFn = handler

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	select {
	case <-tr.WaitCh():
		require.Fail(t, "unexpected exit")
	case <-time.After(1 * time.Second):
	}

	// Send a signal while the task is not running
	err = tr.Signal(structs.NewTaskEvent("don't panic"), "QUIT")
	require.EqualError(t, err, ErrTaskNotRunning.Error())

	// Send a restart
	err = tr.Restart(context.Background(), structs.NewTaskEvent("don't panic"), false)
	require.EqualError(t, err, ErrTaskNotRunning.Error())

	// Unblock and let it finish
	waitCh <- struct{}{}

	select {
	case <-tr.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "timed out waiting for task to complete")
	}

	// Assert the task ran and never restarted
	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.False(t, state.Failed)
	require.Len(t, state.Events, 4, pretty.Sprint(state.Events))
	require.Equal(t, structs.TaskReceived, state.Events[0].Type)
	require.Equal(t, structs.TaskSetup, state.Events[1].Type)
	require.Equal(t, structs.TaskStarted, state.Events[2].Type)
	require.Equal(t, structs.TaskTerminated, state.Events[3].Type)
}

// TestTaskRunner_Run_RecoverableStartError asserts tasks are restarted if they
// return a recoverable error from StartTask.
func TestTaskRunner_Run_RecoverableStartError(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"start_error":             "driver failure",
		"start_error_recoverable": true,
	}

	// Make the restart policy retry once
	rp := &structs.RestartPolicy{
		Attempts: 1,
		Interval: 10 * time.Minute,
		Delay:    0,
		Mode:     structs.RestartPolicyModeFail,
	}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp

	tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		require.Fail(t, "timed out waiting for task to exit")
	}

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.True(t, state.Failed)
	require.Len(t, state.Events, 6, pretty.Sprint(state.Events))
	require.Equal(t, structs.TaskReceived, state.Events[0].Type)
	require.Equal(t, structs.TaskSetup, state.Events[1].Type)
	require.Equal(t, structs.TaskDriverFailure, state.Events[2].Type)
	require.Equal(t, structs.TaskRestarting, state.Events[3].Type)
	require.Equal(t, structs.TaskDriverFailure, state.Events[4].Type)
	require.Equal(t, structs.TaskNotRestarting, state.Events[5].Type)
}

// TestTaskRunner_Template_Artifact asserts that tasks can use artifacts as templates.
func TestTaskRunner_Template_Artifact(t *testing.T) {
	t.Parallel()

	ts := httptest.NewServer(http.FileServer(http.Dir(".")))
	defer ts.Close()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	f1 := "task_runner.go"
	f2 := "test"
	task.Artifacts = []*structs.TaskArtifact{
		{GetterSource: fmt.Sprintf("%s/%s", ts.URL, f1)},
	}
	task.Templates = []*structs.Template{
		{
			SourcePath: f1,
			DestPath:   "local/test",
			ChangeMode: structs.TemplateChangeModeNoop,
		},
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	// Wait for task to run and exit
	select {
	case <-tr.WaitCh():
	case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())):
		require.Fail(t, "timed out waiting for task runner to exit")
	}

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)
	require.True(t, state.Successful())
	require.False(t, state.Failed)

	artifactsDownloaded := false
	for _, e := range state.Events {
		if e.Type == structs.TaskDownloadingArtifacts {
			artifactsDownloaded = true
		}
	}
	assert.True(t, artifactsDownloaded, "expected artifacts downloaded events")

	// Check that both the downloaded artifact and the rendered template exist.
	_, err = os.Stat(filepath.Join(conf.TaskDir.Dir, f1))
	require.NoErrorf(t, err, "%v not downloaded", f1)

	_, err = os.Stat(filepath.Join(conf.TaskDir.LocalDir, f2))
	require.NoErrorf(t, err, "%v not rendered", f2)
}
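
// NOTE: illustrative sketch, not part of the original file. The test above
// serves the package directory itself (http.Dir(".")) so that task_runner.go
// can double as an artifact. A self-contained variant could serve a temp
// directory with a known fixture instead; the names below
// (exampleArtifactServer, payload.txt) are hypothetical. It follows the same
// "return a cleanup func" convention used elsewhere in this file.
func exampleArtifactServer(t *testing.T) (ts *httptest.Server, file string, cleanup func()) {
	dir, err := ioutil.TempDir("", "artifact-fixture")
	require.NoError(t, err)

	// Write a small fixture file to serve as the artifact payload.
	file = "payload.txt"
	require.NoError(t, ioutil.WriteFile(filepath.Join(dir, file), []byte("hello"), 0644))

	ts = httptest.NewServer(http.FileServer(http.Dir(dir)))
	cleanup = func() {
		ts.Close()
		os.RemoveAll(dir)
	}
	return ts, file, cleanup
}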

// TestTaskRunner_Template_BlockingPreStart asserts that a template that fails
// to render in PreStart can be gracefully shut down by either killCtx or
// shutdownCtx.
func TestTaskRunner_Template_BlockingPreStart(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Templates = []*structs.Template{
		{
			EmbeddedTmpl: `{{ with secret "foo/secret" }}{{ .Data.certificate }}{{ end }}`,
			DestPath:     "local/test",
			ChangeMode:   structs.TemplateChangeModeNoop,
		},
	}

	task.Vault = &structs.Vault{Policies: []string{"default"}}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	go tr.Run()
	defer tr.Shutdown()

	testutil.WaitForResult(func() (bool, error) {
		ts := tr.TaskState()

		if len(ts.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}

		for _, e := range ts.Events {
			if e.Type == "Template" && strings.Contains(e.DisplayMessage, "vault.read(foo/secret)") {
				return true, nil
			}
		}

		return false, fmt.Errorf("no missing vault secret template event yet: %#v", ts.Events)
	}, func(err error) {
		require.NoError(t, err)
	})

	shutdown := func() <-chan bool {
		finished := make(chan bool)
		go func() {
			tr.Shutdown()
			finished <- true
		}()

		return finished
	}

	select {
	case <-shutdown():
		// it shut down like it should have
	case <-time.After(10 * time.Second):
		require.Fail(t, "timeout shutting down task")
	}
}
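
// NOTE: illustrative sketch, not part of the original file. The polling loop
// above, and the ones in the Vault manager tests below, wait for a particular
// task event type to appear; a shared helper along these lines could express
// that pattern once. waitForTaskEvent is a hypothetical name.
func waitForTaskEvent(t *testing.T, tr *TaskRunner, eventType string) {
	testutil.WaitForResult(func() (bool, error) {
		state := tr.TaskState()
		if len(state.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}
		for _, e := range state.Events {
			if e.Type == eventType {
				return true, nil
			}
		}
		return false, fmt.Errorf("no %q event yet: %#v", eventType, state.Events)
	}, func(err error) {
		require.NoError(t, err)
	})
}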

// TestTaskRunner_Template_NewVaultToken asserts that a new Vault token is
// created when rendering a template, and that it is revoked on alloc completion.
func TestTaskRunner_Template_NewVaultToken(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Templates = []*structs.Template{
		{
			EmbeddedTmpl: `{{key "foo"}}`,
			DestPath:     "local/test",
			ChangeMode:   structs.TemplateChangeModeNoop,
		},
	}
	task.Vault = &structs.Vault{Policies: []string{"default"}}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	// Wait for a Vault token
	var token string
	testutil.WaitForResult(func() (bool, error) {
		token = tr.getVaultToken()

		if token == "" {
			return false, fmt.Errorf("No Vault token")
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	vault := conf.Vault.(*vaultclient.MockVaultClient)
	renewalCh, ok := vault.RenewTokens()[token]
	require.True(t, ok, "no renewal channel for token")

	renewalCh <- fmt.Errorf("Test killing")
	close(renewalCh)

	var token2 string
	testutil.WaitForResult(func() (bool, error) {
		token2 = tr.getVaultToken()

		if token2 == "" {
			return false, fmt.Errorf("No Vault token")
		}

		if token2 == token {
			return false, fmt.Errorf("token wasn't recreated")
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Check the token was revoked
	testutil.WaitForResult(func() (bool, error) {
		if len(vault.StoppedTokens()) != 1 {
			return false, fmt.Errorf("Expected a stopped token: %v", vault.StoppedTokens())
		}

		if a := vault.StoppedTokens()[0]; a != token {
			return false, fmt.Errorf("got stopped token %q; want %q", a, token)
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_VaultManager_Restart asserts that the task is restarted when
// the allocation's derived Vault token expires and the task is configured with
// the "restart" change mode.
func TestTaskRunner_VaultManager_Restart(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}
	task.Vault = &structs.Vault{
		Policies:   []string{"default"},
		ChangeMode: structs.VaultChangeModeRestart,
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	testWaitForTaskToStart(t, tr)

	tr.vaultTokenLock.Lock()
	token := tr.vaultToken
	tr.vaultTokenLock.Unlock()

	require.NotEmpty(t, token)

	vault := conf.Vault.(*vaultclient.MockVaultClient)
	renewalCh, ok := vault.RenewTokens()[token]
	require.True(t, ok, "no renewal channel for token")

	renewalCh <- fmt.Errorf("Test killing")
	close(renewalCh)

	testutil.WaitForResult(func() (bool, error) {
		state := tr.TaskState()

		if len(state.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}

		foundRestartSignal, foundRestarting := false, false
		for _, e := range state.Events {
			switch e.Type {
			case structs.TaskRestartSignal:
				foundRestartSignal = true
			case structs.TaskRestarting:
				foundRestarting = true
			}
		}

		if !foundRestartSignal {
			return false, fmt.Errorf("no restart signal event yet: %#v", state.Events)
		}

		if !foundRestarting {
			return false, fmt.Errorf("no restarting event yet: %#v", state.Events)
		}

		lastEvent := state.Events[len(state.Events)-1]
		if lastEvent.Type != structs.TaskStarted {
			return false, fmt.Errorf("expected last event to be task starting but was %#v", lastEvent)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}
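
// NOTE: illustrative sketch, not part of the original file. The tests around
// this point simulate an expiring Vault token the same way: look up the mock
// client's renewal channel for the token, push an error onto it, and close it.
// A helper such as this (failTokenRenewal is a hypothetical name) would
// centralize that sequence.
func failTokenRenewal(t *testing.T, vault *vaultclient.MockVaultClient, token string) {
	t.Helper()
	renewalCh, ok := vault.RenewTokens()[token]
	require.True(t, ok, "no renewal channel for token")

	// Simulate the renewal loop reporting a failure, then stop renewing.
	renewalCh <- fmt.Errorf("test: simulated renewal failure")
	close(renewalCh)
}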

// TestTaskRunner_VaultManager_Signal asserts that the task is signalled when
// the allocation's derived Vault token expires and the task is configured with
// the "signal" change mode.
func TestTaskRunner_VaultManager_Signal(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}
	task.Vault = &structs.Vault{
		Policies:     []string{"default"},
		ChangeMode:   structs.VaultChangeModeSignal,
		ChangeSignal: "SIGUSR1",
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	testWaitForTaskToStart(t, tr)

	tr.vaultTokenLock.Lock()
	token := tr.vaultToken
	tr.vaultTokenLock.Unlock()

	require.NotEmpty(t, token)

	vault := conf.Vault.(*vaultclient.MockVaultClient)
	renewalCh, ok := vault.RenewTokens()[token]
	require.True(t, ok, "no renewal channel for token")

	renewalCh <- fmt.Errorf("Test killing")
	close(renewalCh)

	testutil.WaitForResult(func() (bool, error) {
		state := tr.TaskState()

		if len(state.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}

		foundSignaling := false
		for _, e := range state.Events {
			if e.Type == structs.TaskSignaling {
				foundSignaling = true
			}
		}

		if !foundSignaling {
			return false, fmt.Errorf("no signaling event yet: %#v", state.Events)
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_UnregisterConsul_Retries asserts a task is unregistered from
// Consul when waiting to be retried.
func TestTaskRunner_UnregisterConsul_Retries(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	// Make the restart policy retry once
	rp := &structs.RestartPolicy{
		Attempts: 1,
		Interval: 10 * time.Minute,
		Delay:    time.Nanosecond,
		Mode:     structs.RestartPolicyModeFail,
	}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.RestartPolicy = rp
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "1",
		"run_for":   "1ns",
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	tr.Run()

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)

	consul := conf.Consul.(*consulapi.MockConsulServiceClient)
	consulOps := consul.GetOps()
	require.Len(t, consulOps, 8)

	// Initial add
	require.Equal(t, "add", consulOps[0].Op)

	// Removing canary and non-canary entries on first exit
	require.Equal(t, "remove", consulOps[1].Op)
	require.Equal(t, "remove", consulOps[2].Op)

	// Second add on retry
	require.Equal(t, "add", consulOps[3].Op)

	// Removing canary and non-canary entries on retry
	require.Equal(t, "remove", consulOps[4].Op)
	require.Equal(t, "remove", consulOps[5].Op)

	// Removing canary and non-canary entries on stop
	require.Equal(t, "remove", consulOps[6].Op)
	require.Equal(t, "remove", consulOps[7].Op)
}

// testWaitForTaskToStart waits for the task to be running or fails the test.
func testWaitForTaskToStart(t *testing.T, tr *TaskRunner) {
	testutil.WaitForResult(func() (bool, error) {
		ts := tr.TaskState()
		return ts.State == structs.TaskStateRunning, fmt.Errorf("%v", ts.State)
	}, func(err error) {
		require.NoError(t, err)
	})
}
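
// NOTE: illustrative sketch, not part of the original file. A natural
// companion to testWaitForTaskToStart for tests that poll for completion
// instead of selecting on WaitCh; testWaitForTaskToDie is a hypothetical name.
func testWaitForTaskToDie(t *testing.T, tr *TaskRunner) {
	testutil.WaitForResult(func() (bool, error) {
		ts := tr.TaskState()
		return ts.State == structs.TaskStateDead, fmt.Errorf("task state is %v, want %v", ts.State, structs.TaskStateDead)
	}, func(err error) {
		require.NoError(t, err)
	})
}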

// TestTaskRunner_BaseLabels tests that the base labels for the task metrics
// are set appropriately.
func TestTaskRunner_BaseLabels(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	alloc := mock.BatchAlloc()
	alloc.Namespace = "not-default"
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "whoami",
	}

	config, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(config)
	require.NoError(err)

	labels := map[string]string{}
	for _, e := range tr.baseLabels {
		labels[e.Name] = e.Value
	}
	require.Equal(alloc.Job.Name, labels["job"])
	require.Equal(alloc.TaskGroup, labels["task_group"])
	require.Equal(task.Name, labels["task"])
	require.Equal(alloc.ID, labels["alloc_id"])
	require.Equal(alloc.Namespace, labels["namespace"])
}