github.com/smithx10/nomad@v0.9.1-rc1/client/allocrunner/alloc_runner_test.go

package allocrunner

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/client/allochealth"
	"github.com/hashicorp/nomad/client/allocwatcher"
	cconsul "github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// destroy does a blocking destroy on an alloc runner
func destroy(ar *allocRunner) {
	ar.Destroy()
	<-ar.DestroyCh()
}

// TestAllocRunner_AllocState_Initialized asserts that TaskStates returned by
// AllocState() are initialized even before the AllocRunner has run.
func TestAllocRunner_AllocState_Initialized(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	allocState := ar.AllocState()

	require.NotNil(t, allocState)
	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
}

// TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
// entire task group is killed.
func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for all tasks to be killed
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task1 should be killed because Task2 exited
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}
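
		// task1 should have a TaskLeaderDead event recording that it was
		// killed because the leader (task2) exited.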
		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
// leader the leader is stopped before other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create 3 tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr
	alloc.AllocatedResources.Tasks[task3.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop alloc
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
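		// The leader must be stopped, and therefore finished, before either
		// follower is stopped.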
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group whose leader failed before the restore, the leader is
// not stopped since it no longer exists.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create a leader and follower task in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a memory backed statedb
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	// Mimic Nomad exiting before the dying leader was able to stop the other tasks.
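	// These synthetic task states should be persisted to the MemDB configured
	// above, so the second alloc runner's Restore() below sees a dead leader
	// and a still-running follower.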
	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	// Create a new AllocRunner to test RestoreState and Run
	ar2, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar2)

	if err := ar2.Restore(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		for task, state := range alloc.TaskStates {
			if state.State != structs.TaskStateDead {
				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.DestroyCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

func TestAllocRunner_Update_Semantics(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
		upd := a.CopySkipJob()
		upd.AllocModifyIndex++

		return upd
	}

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(err)

	upd1 := updatedAlloc(alloc)
	ar.Update(upd1)

	// Update was placed into a queue
	require.Len(ar.allocUpdatedCh, 1)

	upd2 := updatedAlloc(alloc)
	ar.Update(upd2)

	// Allocation was _replaced_
	require.Len(ar.allocUpdatedCh, 1)
	queuedAlloc := <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Requeueing older alloc is skipped
	ar.Update(upd2)
	ar.Update(upd1)
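
	// Draining the channel should yield upd2: re-queueing it succeeded, while
	// the older upd1 (lower AllocModifyIndex) was skipped.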
	queuedAlloc = <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Updates received after the alloc runner's wait channel is closed are ignored
	close(ar.waitCh)

	ar.Update(upd1)

	// Did not queue the update
	require.Len(ar.allocUpdatedCh, 0)
}

// TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is
// reported for allocations that are migrated, not only for those that are
// part of a deployment.
func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	// Ensure the alloc is *not* part of a deployment
	alloc.DeploymentID = ""

	// Shorten the default migration healthy time
	tg := alloc.Job.TaskGroups[0]
	tg.Migrate = structs.DefaultMigrateStrategy()
	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates

	task := tg.Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "30s",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
// watcher will mark the allocation as healthy based on task states alone.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Create a task that takes longer to become healthy
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
	task2 := alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment that uses task states for
	// health checks
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	start, done := time.Now(), time.Time{}
	go ar.Run()
	defer destroy(ar)

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}

		// Capture the done timestamp
		done = last.DeploymentStatus.Timestamp
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
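
	// task2 blocks for 500ms before starting, so the alloc cannot have been
	// marked healthy any sooner than that.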
	if d := done.Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
// watcher will mark the allocation as unhealthy with failing checks.
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Set a service with a check
	task.Services = []*structs.Service{
		{
			Name:      "fakeservice",
			PortLabel: "http",
			Checks: []*structs.ServiceCheck{
				{
					Name:     "fakecheck",
					Type:     structs.ServiceCheckScript,
					Command:  "true",
					Interval: 30 * time.Second,
					Timeout:  5 * time.Second,
				},
			},
		},
	}

	// Make the alloc be part of a deployment
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Have the mock Consul client always report the check in a warning state
	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.TaskRegistration{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "fakeservice"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	var lastUpdate *structs.Allocation
	upd := conf.StateUpdater.(*MockStateUpdater)
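	// With the check stuck in a warning state, the health watcher should mark
	// the deployment unhealthy once the 1s HealthyDeadline expires.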
	testutil.WaitForResult(func() (bool, error) {
		lastUpdate = upd.Last()
		if lastUpdate == nil {
			return false, fmt.Errorf("No updates")
		}
		if !lastUpdate.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *lastUpdate.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	require.Len(t, lastUpdate.TaskStates, 1)
	state := lastUpdate.TaskStates[task.Name]
	require.NotNil(t, state)
	require.NotEmpty(t, state.Events)
	last := state.Events[len(state.Events)-1]
	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
	require.Contains(t, last.Message, "by deadline")
}

// TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running
// alloc.
func TestAllocRunner_Destroy(t *testing.T) {
	t.Parallel()

	// Ensure task takes some time
	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config["run_for"] = "10s"

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a MemDB to assert alloc state gets cleaned up
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()

	// Wait for alloc to be running
	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		return state.ClientStatus == structs.AllocClientStatusRunning,
			fmt.Errorf("got client status %v; want running", state.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Assert state was stored
	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)

	// Now destroy
	ar.Destroy()

	select {
	case <-ar.DestroyCh():
		// Destroyed properly!
	case <-time.After(10 * time.Second):
		require.Fail(t, "timed out waiting for alloc to be destroyed")
	}

	// Assert alloc is dead
	state := ar.AllocState()
	require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus)

	// Assert the state was cleaned
	ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.Nil(t, ls)
	require.Nil(t, ts)

	// Assert the alloc directory was cleaned
	if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
		require.Failf(t, "alloc dir still exists", "%v", ar.allocDir.AllocDir)
	} else if !os.IsNotExist(err) {
		require.Failf(t, "expected NotExist error", "found %v", err)
	}
}

func TestAllocRunner_SimpleRun(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	// Wait for the alloc to complete
	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		if state.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
		}

		for t, s := range state.TaskStates {
			if s.FinishedAt.IsZero() {
				return false, fmt.Errorf("task %q has zero FinishedAt value", t)
			}
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestAllocRunner_MoveAllocDir asserts that a rescheduled allocation copies
// ephemeral disk content from the previous alloc's run.
func TestAllocRunner_MoveAllocDir(t *testing.T) {
	t.Parallel()

	// Step 1: start and run a task
	alloc := mock.BatchAlloc()
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	ar.Run()
	defer destroy(ar)

	require.Equal(t, structs.AllocClientStatusComplete, ar.AllocState().ClientStatus)

	// Step 2. Modify its directory
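	// These files should be moved into the replacement alloc's directory
	// because alloc2 below marks the ephemeral disk as sticky.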
	task := alloc.Job.TaskGroups[0].Tasks[0]
	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
	require.NoError(t, ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm))
	taskDir := ar.allocDir.TaskDirs[task.Name]
	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
	require.NoError(t, ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm))

	// Step 3. Start a new alloc
	alloc2 := mock.BatchAlloc()
	alloc2.PreviousAllocation = alloc.ID
	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true

	conf2, cleanup := testAllocRunnerConfig(t, alloc2)
	conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{
		Alloc:          alloc2,
		PreviousRunner: ar,
		Logger:         conf2.Logger,
	})
	defer cleanup()
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	ar2.Run()
	defer destroy(ar2)

	require.Equal(t, structs.AllocClientStatusComplete, ar2.AllocState().ClientStatus)

	// Ensure that data from ar was moved to ar2
	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
	fileInfo, _ := os.Stat(dataFile)
	require.NotNilf(t, fileInfo, "file %q not found", dataFile)

	taskDir = ar2.allocDir.TaskDirs[task.Name]
	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
	fileInfo, _ = os.Stat(taskLocalFile)
	require.NotNilf(t, fileInfo, "file %q not found", taskLocalFile)
}

// TestAllocRunner_HandlesArtifactFailure ensures that if one task in a task
// group is stuck retrying an artifact fetch, the other tasks in the group can
// still proceed.
func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
		Mode:     structs.RestartPolicyModeFail,
		Attempts: 1,
		Delay:    time.Nanosecond,
		Interval: time.Hour,
	}

	// Create a new task with a bad artifact
	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	badtask.Name = "bad"
	badtask.Artifacts = []*structs.TaskArtifact{
		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
	alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{
		Cpu: structs.AllocatedCpuResources{
			CpuShares: 500,
		},
		Memory: structs.AllocatedMemoryResources{
			MemoryMB: 256,
		},
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)
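
	// Wait for the alloc to reach a terminal status: the bad task can never
	// fetch its artifact, so the group as a whole should fail.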
	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		switch state.ClientStatus {
		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
			return true, nil
		default:
			return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus)
		}

	}, func(err error) {
		require.NoError(t, err)
	})

	state := ar.AllocState()
	require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus)
	require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State)
	require.True(t, state.TaskStates["web"].Successful())
	require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State)
	require.True(t, state.TaskStates["bad"].Failed)
}

// Test that alloc runner kills tasks in the task group when another task fails
func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Config = map[string]interface{}{
		"start_error": "fail task please",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()
	upd := conf.StateUpdater.(*MockStateUpdater)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskSiblingFailed {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
		}

		// Task Two should be failed
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if !state2.Failed {
			return false, fmt.Errorf("task2 should have failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that an alloc becoming terminal destroys the alloc runner
func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
	t.Parallel()
	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	// Ensure task takes some time
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	alloc.AllocatedResources.Tasks[task.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()
	upd := conf.StateUpdater.(*MockStateUpdater)
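
	// Wait for the task to be running before updating the alloc to be terminal.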
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}