github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/alloc_runner_test.go (about) 1 package allocrunner 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "os" 7 "path/filepath" 8 "testing" 9 "time" 10 11 "github.com/hashicorp/consul/api" 12 "github.com/hashicorp/nomad/client/allochealth" 13 "github.com/hashicorp/nomad/client/allocwatcher" 14 cconsul "github.com/hashicorp/nomad/client/consul" 15 "github.com/hashicorp/nomad/client/state" 16 "github.com/hashicorp/nomad/command/agent/consul" 17 "github.com/hashicorp/nomad/helper/uuid" 18 "github.com/hashicorp/nomad/nomad/mock" 19 "github.com/hashicorp/nomad/nomad/structs" 20 "github.com/hashicorp/nomad/testutil" 21 "github.com/stretchr/testify/require" 22 ) 23 24 // destroy does a blocking destroy on an alloc runner 25 func destroy(ar *allocRunner) { 26 ar.Destroy() 27 <-ar.DestroyCh() 28 } 29 30 // TestAllocRunner_AllocState_Initialized asserts that getting TaskStates via 31 // AllocState() are initialized even before the AllocRunner has run. 32 func TestAllocRunner_AllocState_Initialized(t *testing.T) { 33 t.Parallel() 34 35 alloc := mock.Alloc() 36 alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver" 37 conf, cleanup := testAllocRunnerConfig(t, alloc) 38 defer cleanup() 39 40 ar, err := NewAllocRunner(conf) 41 require.NoError(t, err) 42 43 allocState := ar.AllocState() 44 45 require.NotNil(t, allocState) 46 require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name]) 47 } 48 49 // TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the 50 // entire task group is killed. 51 func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { 52 t.Parallel() 53 54 alloc := mock.BatchAlloc() 55 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 56 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 57 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 58 59 // Create two tasks in the task group 60 task := alloc.Job.TaskGroups[0].Tasks[0] 61 task.Name = "task1" 62 task.Driver = "mock_driver" 63 task.KillTimeout = 10 * time.Millisecond 64 task.Config = map[string]interface{}{ 65 "run_for": "10s", 66 } 67 68 task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 69 task2.Name = "task2" 70 task2.Driver = "mock_driver" 71 task2.Leader = true 72 task2.Config = map[string]interface{}{ 73 "run_for": "1s", 74 } 75 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) 76 alloc.AllocatedResources.Tasks[task.Name] = tr 77 alloc.AllocatedResources.Tasks[task2.Name] = tr 78 79 conf, cleanup := testAllocRunnerConfig(t, alloc) 80 defer cleanup() 81 ar, err := NewAllocRunner(conf) 82 require.NoError(t, err) 83 defer destroy(ar) 84 go ar.Run() 85 86 // Wait for all tasks to be killed 87 upd := conf.StateUpdater.(*MockStateUpdater) 88 testutil.WaitForResult(func() (bool, error) { 89 last := upd.Last() 90 if last == nil { 91 return false, fmt.Errorf("No updates") 92 } 93 if last.ClientStatus != structs.AllocClientStatusComplete { 94 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 95 } 96 97 // Task1 should be killed because Task2 exited 98 state1 := last.TaskStates[task.Name] 99 if state1.State != structs.TaskStateDead { 100 return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) 101 } 102 if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() { 103 return false, fmt.Errorf("expected to have a start and finish time") 104 } 105 if len(state1.Events) < 2 { 106 // At least have a received and destroyed 107 return false, fmt.Errorf("Unexpected number of events") 108 } 109 110 found := false 111 killingMsg := "" 112 for _, e := range state1.Events { 113 if e.Type != structs.TaskLeaderDead { 114 found = true 115 } 116 if e.Type == structs.TaskKilling { 117 killingMsg = e.DisplayMessage 118 } 119 } 120 121 if !found { 122 return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead) 123 } 124 125 expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing" 126 if killingMsg != expectedKillingMsg { 127 return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg) 128 } 129 130 // Task Two should be dead 131 state2 := last.TaskStates[task2.Name] 132 if state2.State != structs.TaskStateDead { 133 return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead) 134 } 135 if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() { 136 return false, fmt.Errorf("expected to have a start and finish time") 137 } 138 139 return true, nil 140 }, func(err error) { 141 t.Fatalf("err: %v", err) 142 }) 143 } 144 145 func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) { 146 t.Parallel() 147 148 alloc := mock.Alloc() 149 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 150 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 151 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 152 153 // Create a group service 154 tg := alloc.Job.TaskGroups[0] 155 tg.Services = []*structs.Service{ 156 { 157 Name: "shutdown_service", 158 }, 159 } 160 161 // Create two tasks in the group 162 task := alloc.Job.TaskGroups[0].Tasks[0] 163 task.Name = "follower1" 164 task.Driver = "mock_driver" 165 task.Config = map[string]interface{}{ 166 "run_for": "10s", 167 } 168 169 task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 170 task2.Name = "leader" 171 task2.Driver = "mock_driver" 172 task2.Leader = true 173 task2.Config = map[string]interface{}{ 174 "run_for": "10s", 175 } 176 177 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) 178 alloc.AllocatedResources.Tasks[task.Name] = tr 179 alloc.AllocatedResources.Tasks[task2.Name] = tr 180 181 // Set a shutdown delay 182 shutdownDelay := 1 * time.Second 183 alloc.Job.TaskGroups[0].ShutdownDelay = &shutdownDelay 184 185 conf, cleanup := testAllocRunnerConfig(t, alloc) 186 defer cleanup() 187 ar, err := NewAllocRunner(conf) 188 require.NoError(t, err) 189 defer destroy(ar) 190 go ar.Run() 191 192 // Wait for tasks to start 193 upd := conf.StateUpdater.(*MockStateUpdater) 194 last := upd.Last() 195 testutil.WaitForResult(func() (bool, error) { 196 last = upd.Last() 197 if last == nil { 198 return false, fmt.Errorf("No updates") 199 } 200 if n := len(last.TaskStates); n != 2 { 201 return false, fmt.Errorf("Not enough task states (want: 2; found %d)", n) 202 } 203 for name, state := range last.TaskStates { 204 if state.State != structs.TaskStateRunning { 205 return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State) 206 } 207 } 208 return true, nil 209 }, func(err error) { 210 t.Fatalf("err: %v", err) 211 }) 212 213 // Reset updates 214 upd.Reset() 215 216 // Stop alloc 217 shutdownInit := time.Now() 218 update := alloc.Copy() 219 update.DesiredStatus = structs.AllocDesiredStatusStop 220 ar.Update(update) 221 222 // Wait for tasks to stop 223 testutil.WaitForResult(func() (bool, error) { 224 last := upd.Last() 225 if last == nil { 226 return false, fmt.Errorf("No updates") 227 } 228 229 fin := last.TaskStates["leader"].FinishedAt 230 231 if fin.IsZero() { 232 return false, nil 233 } 234 235 return true, nil 236 }, func(err error) { 237 last := upd.Last() 238 for name, state := range last.TaskStates { 239 t.Logf("%s: %s", name, state.State) 240 } 241 t.Fatalf("err: %v", err) 242 }) 243 244 // Get consul client operations 245 consulClient := conf.Consul.(*cconsul.MockConsulServiceClient) 246 consulOpts := consulClient.GetOps() 247 var groupRemoveOp cconsul.MockConsulOp 248 for _, op := range consulOpts { 249 // Grab the first deregistration request 250 if op.Op == "remove" && op.Name == "group-web" { 251 groupRemoveOp = op 252 break 253 } 254 } 255 256 // Ensure remove operation is close to shutdown initiation 257 require.True(t, groupRemoveOp.OccurredAt.Sub(shutdownInit) < 100*time.Millisecond) 258 259 last = upd.Last() 260 minShutdown := shutdownInit.Add(task.ShutdownDelay) 261 leaderFinished := last.TaskStates["leader"].FinishedAt 262 followerFinished := last.TaskStates["follower1"].FinishedAt 263 264 // Check that both tasks shut down after min possible shutdown time 265 require.Greater(t, leaderFinished.UnixNano(), minShutdown.UnixNano()) 266 require.Greater(t, followerFinished.UnixNano(), minShutdown.UnixNano()) 267 268 // Check that there is at least shutdown_delay between consul 269 // remove operation and task finished at time 270 require.True(t, leaderFinished.Sub(groupRemoveOp.OccurredAt) > shutdownDelay) 271 } 272 273 // TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a 274 // leader the leader is stopped before other tasks. 275 func TestAllocRunner_TaskLeader_StopTG(t *testing.T) { 276 t.Parallel() 277 278 alloc := mock.Alloc() 279 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 280 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 281 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 282 283 // Create 3 tasks in the task group 284 task := alloc.Job.TaskGroups[0].Tasks[0] 285 task.Name = "follower1" 286 task.Driver = "mock_driver" 287 task.Config = map[string]interface{}{ 288 "run_for": "10s", 289 } 290 291 task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 292 task2.Name = "leader" 293 task2.Driver = "mock_driver" 294 task2.Leader = true 295 task2.Config = map[string]interface{}{ 296 "run_for": "10s", 297 } 298 299 task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 300 task3.Name = "follower2" 301 task3.Driver = "mock_driver" 302 task3.Config = map[string]interface{}{ 303 "run_for": "10s", 304 } 305 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3) 306 alloc.AllocatedResources.Tasks[task.Name] = tr 307 alloc.AllocatedResources.Tasks[task2.Name] = tr 308 alloc.AllocatedResources.Tasks[task3.Name] = tr 309 310 conf, cleanup := testAllocRunnerConfig(t, alloc) 311 defer cleanup() 312 ar, err := NewAllocRunner(conf) 313 require.NoError(t, err) 314 defer destroy(ar) 315 go ar.Run() 316 317 // Wait for tasks to start 318 upd := conf.StateUpdater.(*MockStateUpdater) 319 last := upd.Last() 320 testutil.WaitForResult(func() (bool, error) { 321 last = upd.Last() 322 if last == nil { 323 return false, fmt.Errorf("No updates") 324 } 325 if n := len(last.TaskStates); n != 3 { 326 return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n) 327 } 328 for name, state := range last.TaskStates { 329 if state.State != structs.TaskStateRunning { 330 return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State) 331 } 332 } 333 return true, nil 334 }, func(err error) { 335 t.Fatalf("err: %v", err) 336 }) 337 338 // Reset updates 339 upd.Reset() 340 341 // Stop alloc 342 update := alloc.Copy() 343 update.DesiredStatus = structs.AllocDesiredStatusStop 344 ar.Update(update) 345 346 // Wait for tasks to stop 347 testutil.WaitForResult(func() (bool, error) { 348 last := upd.Last() 349 if last == nil { 350 return false, fmt.Errorf("No updates") 351 } 352 if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() { 353 return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s", 354 last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt) 355 } 356 if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() { 357 return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s", 358 last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt) 359 } 360 return true, nil 361 }, func(err error) { 362 last := upd.Last() 363 for name, state := range last.TaskStates { 364 t.Logf("%s: %s", name, state.State) 365 } 366 t.Fatalf("err: %v", err) 367 }) 368 } 369 370 // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a 371 // restored task group with a leader that failed before restoring the leader is 372 // not stopped as it does not exist. 373 // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932 374 func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) { 375 t.Parallel() 376 377 alloc := mock.Alloc() 378 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 379 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 380 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 381 382 // Create a leader and follower task in the task group 383 task := alloc.Job.TaskGroups[0].Tasks[0] 384 task.Name = "follower1" 385 task.Driver = "mock_driver" 386 task.KillTimeout = 10 * time.Second 387 task.Config = map[string]interface{}{ 388 "run_for": "10s", 389 } 390 391 task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 392 task2.Name = "leader" 393 task2.Driver = "mock_driver" 394 task2.Leader = true 395 task2.KillTimeout = 10 * time.Millisecond 396 task2.Config = map[string]interface{}{ 397 "run_for": "10s", 398 } 399 400 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) 401 alloc.AllocatedResources.Tasks[task.Name] = tr 402 alloc.AllocatedResources.Tasks[task2.Name] = tr 403 404 conf, cleanup := testAllocRunnerConfig(t, alloc) 405 defer cleanup() 406 407 // Use a memory backed statedb 408 conf.StateDB = state.NewMemDB(conf.Logger) 409 410 ar, err := NewAllocRunner(conf) 411 require.NoError(t, err) 412 413 // Mimic Nomad exiting before the leader stopping is able to stop other tasks. 414 ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled)) 415 ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) 416 417 // Create a new AllocRunner to test RestoreState and Run 418 ar2, err := NewAllocRunner(conf) 419 require.NoError(t, err) 420 defer destroy(ar2) 421 422 if err := ar2.Restore(); err != nil { 423 t.Fatalf("error restoring state: %v", err) 424 } 425 ar2.Run() 426 427 // Wait for tasks to be stopped because leader is dead 428 testutil.WaitForResult(func() (bool, error) { 429 alloc := ar2.Alloc() 430 // TODO: this test does not test anything!!! alloc.TaskStates is an empty map 431 for task, state := range alloc.TaskStates { 432 if state.State != structs.TaskStateDead { 433 return false, fmt.Errorf("Task %q should be dead: %v", task, state.State) 434 } 435 } 436 return true, nil 437 }, func(err error) { 438 t.Fatalf("err: %v", err) 439 }) 440 441 // Make sure it GCs properly 442 ar2.Destroy() 443 444 select { 445 case <-ar2.DestroyCh(): 446 // exited as expected 447 case <-time.After(10 * time.Second): 448 t.Fatalf("timed out waiting for AR to GC") 449 } 450 } 451 452 func TestAllocRunner_Restore_LifecycleHooks(t *testing.T) { 453 t.Parallel() 454 455 alloc := mock.LifecycleAlloc() 456 457 conf, cleanup := testAllocRunnerConfig(t, alloc) 458 defer cleanup() 459 460 // Use a memory backed statedb 461 conf.StateDB = state.NewMemDB(conf.Logger) 462 463 ar, err := NewAllocRunner(conf) 464 require.NoError(t, err) 465 466 // We should see all tasks with Prestart hooks are not blocked from running: 467 // i.e. the "init" and "side" task hook coordinator channels are closed 468 require.Truef(t, isChannelClosed(ar.taskHookCoordinator.startConditionForTask(ar.tasks["init"].Task())), "init channel was open, should be closed") 469 require.Truef(t, isChannelClosed(ar.taskHookCoordinator.startConditionForTask(ar.tasks["side"].Task())), "side channel was open, should be closed") 470 471 isChannelClosed(ar.taskHookCoordinator.startConditionForTask(ar.tasks["side"].Task())) 472 473 // Mimic client dies while init task running, and client restarts after init task finished 474 ar.tasks["init"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskTerminated)) 475 ar.tasks["side"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) 476 477 // Create a new AllocRunner to test RestoreState and Run 478 ar2, err := NewAllocRunner(conf) 479 require.NoError(t, err) 480 481 if err := ar2.Restore(); err != nil { 482 t.Fatalf("error restoring state: %v", err) 483 } 484 485 // We want to see Restore resume execution with correct hook ordering: 486 // i.e. we should see the "web" main task hook coordinator channel is closed 487 require.Truef(t, isChannelClosed(ar2.taskHookCoordinator.startConditionForTask(ar.tasks["web"].Task())), "web channel was open, should be closed") 488 } 489 490 func TestAllocRunner_Update_Semantics(t *testing.T) { 491 t.Parallel() 492 require := require.New(t) 493 494 updatedAlloc := func(a *structs.Allocation) *structs.Allocation { 495 upd := a.CopySkipJob() 496 upd.AllocModifyIndex++ 497 498 return upd 499 } 500 501 alloc := mock.Alloc() 502 alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver" 503 conf, cleanup := testAllocRunnerConfig(t, alloc) 504 defer cleanup() 505 506 ar, err := NewAllocRunner(conf) 507 require.NoError(err) 508 509 upd1 := updatedAlloc(alloc) 510 ar.Update(upd1) 511 512 // Update was placed into a queue 513 require.Len(ar.allocUpdatedCh, 1) 514 515 upd2 := updatedAlloc(alloc) 516 ar.Update(upd2) 517 518 // Allocation was _replaced_ 519 520 require.Len(ar.allocUpdatedCh, 1) 521 queuedAlloc := <-ar.allocUpdatedCh 522 require.Equal(upd2, queuedAlloc) 523 524 // Requeueing older alloc is skipped 525 ar.Update(upd2) 526 ar.Update(upd1) 527 528 queuedAlloc = <-ar.allocUpdatedCh 529 require.Equal(upd2, queuedAlloc) 530 531 // Ignore after watch closed 532 533 close(ar.waitCh) 534 535 ar.Update(upd1) 536 537 // Did not queue the update 538 require.Len(ar.allocUpdatedCh, 0) 539 } 540 541 // TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is 542 // reported for services that got migrated; not just part of deployments. 543 func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) { 544 t.Parallel() 545 546 alloc := mock.Alloc() 547 548 // Ensure the alloc is *not* part of a deployment 549 alloc.DeploymentID = "" 550 551 // Shorten the default migration healthy time 552 tg := alloc.Job.TaskGroups[0] 553 tg.Migrate = structs.DefaultMigrateStrategy() 554 tg.Migrate.MinHealthyTime = 100 * time.Millisecond 555 tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates 556 557 task := tg.Tasks[0] 558 task.Driver = "mock_driver" 559 task.Config = map[string]interface{}{ 560 "run_for": "30s", 561 } 562 563 conf, cleanup := testAllocRunnerConfig(t, alloc) 564 defer cleanup() 565 566 ar, err := NewAllocRunner(conf) 567 require.NoError(t, err) 568 go ar.Run() 569 defer destroy(ar) 570 571 upd := conf.StateUpdater.(*MockStateUpdater) 572 testutil.WaitForResult(func() (bool, error) { 573 last := upd.Last() 574 if last == nil { 575 return false, fmt.Errorf("No updates") 576 } 577 if !last.DeploymentStatus.HasHealth() { 578 return false, fmt.Errorf("want deployment status unhealthy; got unset") 579 } else if !*last.DeploymentStatus.Healthy { 580 // This is fatal 581 t.Fatal("want deployment status healthy; got unhealthy") 582 } 583 return true, nil 584 }, func(err error) { 585 require.NoError(t, err) 586 }) 587 } 588 589 // TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health 590 // watcher will mark the allocation as healthy based on task states alone. 591 func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) { 592 t.Parallel() 593 594 alloc := mock.Alloc() 595 596 task := alloc.Job.TaskGroups[0].Tasks[0] 597 task.Driver = "mock_driver" 598 task.Config = map[string]interface{}{ 599 "run_for": "10s", 600 } 601 602 // Create a task that takes longer to become healthy 603 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy()) 604 alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy() 605 task2 := alloc.Job.TaskGroups[0].Tasks[1] 606 task2.Name = "task2" 607 task2.Config["start_block_for"] = "500ms" 608 609 // Make the alloc be part of a deployment that uses task states for 610 // health checks 611 alloc.DeploymentID = uuid.Generate() 612 alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 613 alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates 614 alloc.Job.TaskGroups[0].Update.MaxParallel = 1 615 alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 616 617 conf, cleanup := testAllocRunnerConfig(t, alloc) 618 defer cleanup() 619 620 ar, err := NewAllocRunner(conf) 621 require.NoError(t, err) 622 623 start, done := time.Now(), time.Time{} 624 go ar.Run() 625 defer destroy(ar) 626 627 upd := conf.StateUpdater.(*MockStateUpdater) 628 testutil.WaitForResult(func() (bool, error) { 629 last := upd.Last() 630 if last == nil { 631 return false, fmt.Errorf("No updates") 632 } 633 if !last.DeploymentStatus.HasHealth() { 634 return false, fmt.Errorf("want deployment status unhealthy; got unset") 635 } else if !*last.DeploymentStatus.Healthy { 636 // This is fatal 637 t.Fatal("want deployment status healthy; got unhealthy") 638 } 639 640 // Capture the done timestamp 641 done = last.DeploymentStatus.Timestamp 642 return true, nil 643 }, func(err error) { 644 require.NoError(t, err) 645 }) 646 647 if d := done.Sub(start); d < 500*time.Millisecond { 648 t.Fatalf("didn't wait for second task group. Only took %v", d) 649 } 650 } 651 652 // TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health 653 // watcher will mark the allocation as unhealthy with failing checks. 654 func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) { 655 t.Parallel() 656 657 alloc := mock.Alloc() 658 task := alloc.Job.TaskGroups[0].Tasks[0] 659 task.Driver = "mock_driver" 660 task.Config = map[string]interface{}{ 661 "run_for": "10s", 662 } 663 664 // Set a service with check 665 task.Services = []*structs.Service{ 666 { 667 Name: "fakservice", 668 PortLabel: "http", 669 Checks: []*structs.ServiceCheck{ 670 { 671 Name: "fakecheck", 672 Type: structs.ServiceCheckScript, 673 Command: "true", 674 Interval: 30 * time.Second, 675 Timeout: 5 * time.Second, 676 }, 677 }, 678 }, 679 } 680 681 // Make the alloc be part of a deployment 682 alloc.DeploymentID = uuid.Generate() 683 alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 684 alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks 685 alloc.Job.TaskGroups[0].Update.MaxParallel = 1 686 alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 687 alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second 688 689 checkUnhealthy := &api.AgentCheck{ 690 CheckID: uuid.Generate(), 691 Status: api.HealthWarning, 692 } 693 694 conf, cleanup := testAllocRunnerConfig(t, alloc) 695 defer cleanup() 696 697 // Only return the check as healthy after a duration 698 consulClient := conf.Consul.(*cconsul.MockConsulServiceClient) 699 consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) { 700 return &consul.AllocRegistration{ 701 Tasks: map[string]*consul.ServiceRegistrations{ 702 task.Name: { 703 Services: map[string]*consul.ServiceRegistration{ 704 "123": { 705 Service: &api.AgentService{Service: "fakeservice"}, 706 Checks: []*api.AgentCheck{checkUnhealthy}, 707 }, 708 }, 709 }, 710 }, 711 }, nil 712 } 713 714 ar, err := NewAllocRunner(conf) 715 require.NoError(t, err) 716 go ar.Run() 717 defer destroy(ar) 718 719 var lastUpdate *structs.Allocation 720 upd := conf.StateUpdater.(*MockStateUpdater) 721 testutil.WaitForResult(func() (bool, error) { 722 lastUpdate = upd.Last() 723 if lastUpdate == nil { 724 return false, fmt.Errorf("No updates") 725 } 726 if !lastUpdate.DeploymentStatus.HasHealth() { 727 return false, fmt.Errorf("want deployment status unhealthy; got unset") 728 } else if *lastUpdate.DeploymentStatus.Healthy { 729 // This is fatal 730 t.Fatal("want deployment status unhealthy; got healthy") 731 } 732 return true, nil 733 }, func(err error) { 734 require.NoError(t, err) 735 }) 736 737 // Assert that we have an event explaining why we are unhealthy. 738 require.Len(t, lastUpdate.TaskStates, 1) 739 state := lastUpdate.TaskStates[task.Name] 740 require.NotNil(t, state) 741 require.NotEmpty(t, state.Events) 742 last := state.Events[len(state.Events)-1] 743 require.Equal(t, allochealth.AllocHealthEventSource, last.Type) 744 require.Contains(t, last.Message, "by deadline") 745 } 746 747 // TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running 748 // alloc. 749 func TestAllocRunner_Destroy(t *testing.T) { 750 t.Parallel() 751 752 // Ensure task takes some time 753 alloc := mock.BatchAlloc() 754 task := alloc.Job.TaskGroups[0].Tasks[0] 755 task.Config["run_for"] = "10s" 756 757 conf, cleanup := testAllocRunnerConfig(t, alloc) 758 defer cleanup() 759 760 // Use a MemDB to assert alloc state gets cleaned up 761 conf.StateDB = state.NewMemDB(conf.Logger) 762 763 ar, err := NewAllocRunner(conf) 764 require.NoError(t, err) 765 go ar.Run() 766 767 // Wait for alloc to be running 768 testutil.WaitForResult(func() (bool, error) { 769 state := ar.AllocState() 770 771 return state.ClientStatus == structs.AllocClientStatusRunning, 772 fmt.Errorf("got client status %v; want running", state.ClientStatus) 773 }, func(err error) { 774 require.NoError(t, err) 775 }) 776 777 // Assert state was stored 778 ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name) 779 require.NoError(t, err) 780 require.NotNil(t, ls) 781 require.NotNil(t, ts) 782 783 // Now destroy 784 ar.Destroy() 785 786 select { 787 case <-ar.DestroyCh(): 788 // Destroyed properly! 789 case <-time.After(10 * time.Second): 790 require.Fail(t, "timed out waiting for alloc to be destroyed") 791 } 792 793 // Assert alloc is dead 794 state := ar.AllocState() 795 require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus) 796 797 // Assert the state was cleaned 798 ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name) 799 require.NoError(t, err) 800 require.Nil(t, ls) 801 require.Nil(t, ts) 802 803 // Assert the alloc directory was cleaned 804 if _, err := os.Stat(ar.allocDir.AllocDir); err == nil { 805 require.Fail(t, "alloc dir still exists: %v", ar.allocDir.AllocDir) 806 } else if !os.IsNotExist(err) { 807 require.Failf(t, "expected NotExist error", "found %v", err) 808 } 809 } 810 811 func TestAllocRunner_SimpleRun(t *testing.T) { 812 t.Parallel() 813 814 alloc := mock.BatchAlloc() 815 816 conf, cleanup := testAllocRunnerConfig(t, alloc) 817 defer cleanup() 818 ar, err := NewAllocRunner(conf) 819 require.NoError(t, err) 820 go ar.Run() 821 defer destroy(ar) 822 823 // Wait for alloc to be running 824 testutil.WaitForResult(func() (bool, error) { 825 state := ar.AllocState() 826 827 if state.ClientStatus != structs.AllocClientStatusComplete { 828 return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete) 829 } 830 831 for t, s := range state.TaskStates { 832 if s.FinishedAt.IsZero() { 833 return false, fmt.Errorf("task %q has zero FinishedAt value", t) 834 } 835 } 836 837 return true, nil 838 }, func(err error) { 839 require.NoError(t, err) 840 }) 841 842 } 843 844 // TestAllocRunner_MoveAllocDir asserts that a rescheduled 845 // allocation copies ephemeral disk content from previous alloc run 846 func TestAllocRunner_MoveAllocDir(t *testing.T) { 847 t.Parallel() 848 849 // Step 1: start and run a task 850 alloc := mock.BatchAlloc() 851 conf, cleanup := testAllocRunnerConfig(t, alloc) 852 defer cleanup() 853 ar, err := NewAllocRunner(conf) 854 require.NoError(t, err) 855 ar.Run() 856 defer destroy(ar) 857 858 require.Equal(t, structs.AllocClientStatusComplete, ar.AllocState().ClientStatus) 859 860 // Step 2. Modify its directory 861 task := alloc.Job.TaskGroups[0].Tasks[0] 862 dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file") 863 ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm) 864 taskDir := ar.allocDir.TaskDirs[task.Name] 865 taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file") 866 ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm) 867 868 // Step 3. Start a new alloc 869 alloc2 := mock.BatchAlloc() 870 alloc2.PreviousAllocation = alloc.ID 871 alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true 872 873 conf2, cleanup := testAllocRunnerConfig(t, alloc2) 874 conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{ 875 Alloc: alloc2, 876 PreviousRunner: ar, 877 Logger: conf2.Logger, 878 }) 879 defer cleanup() 880 ar2, err := NewAllocRunner(conf2) 881 require.NoError(t, err) 882 883 ar2.Run() 884 defer destroy(ar2) 885 886 require.Equal(t, structs.AllocClientStatusComplete, ar2.AllocState().ClientStatus) 887 888 // Ensure that data from ar was moved to ar2 889 dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file") 890 fileInfo, _ := os.Stat(dataFile) 891 require.NotNilf(t, fileInfo, "file %q not found", dataFile) 892 893 taskDir = ar2.allocDir.TaskDirs[task.Name] 894 taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file") 895 fileInfo, _ = os.Stat(taskLocalFile) 896 require.NotNilf(t, fileInfo, "file %q not found", dataFile) 897 898 } 899 900 // TestAllocRuner_HandlesArtifactFailure ensures that if one task in a task group is 901 // retrying fetching an artifact, other tasks in the group should be able 902 // to proceed. 903 func TestAllocRunner_HandlesArtifactFailure(t *testing.T) { 904 t.Parallel() 905 906 alloc := mock.BatchAlloc() 907 rp := &structs.RestartPolicy{ 908 Mode: structs.RestartPolicyModeFail, 909 Attempts: 1, 910 Delay: time.Nanosecond, 911 Interval: time.Hour, 912 } 913 alloc.Job.TaskGroups[0].RestartPolicy = rp 914 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp 915 916 // Create a new task with a bad artifact 917 badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy() 918 badtask.Name = "bad" 919 badtask.Artifacts = []*structs.TaskArtifact{ 920 {GetterSource: "http://127.0.0.1:0/foo/bar/baz"}, 921 } 922 923 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask) 924 alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{ 925 Cpu: structs.AllocatedCpuResources{ 926 CpuShares: 500, 927 }, 928 Memory: structs.AllocatedMemoryResources{ 929 MemoryMB: 256, 930 }, 931 } 932 933 conf, cleanup := testAllocRunnerConfig(t, alloc) 934 defer cleanup() 935 ar, err := NewAllocRunner(conf) 936 require.NoError(t, err) 937 go ar.Run() 938 defer destroy(ar) 939 940 testutil.WaitForResult(func() (bool, error) { 941 state := ar.AllocState() 942 943 switch state.ClientStatus { 944 case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed: 945 return true, nil 946 default: 947 return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus) 948 } 949 950 }, func(err error) { 951 require.NoError(t, err) 952 }) 953 954 state := ar.AllocState() 955 require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus) 956 require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State) 957 require.True(t, state.TaskStates["web"].Successful()) 958 require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State) 959 require.True(t, state.TaskStates["bad"].Failed) 960 } 961 962 // Test that alloc runner kills tasks in task group when another task fails 963 func TestAllocRunner_TaskFailed_KillTG(t *testing.T) { 964 alloc := mock.Alloc() 965 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 966 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 967 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 968 969 // Create two tasks in the task group 970 task := alloc.Job.TaskGroups[0].Tasks[0] 971 task.Name = "task1" 972 task.Driver = "mock_driver" 973 task.KillTimeout = 10 * time.Millisecond 974 task.Config = map[string]interface{}{ 975 "run_for": "10s", 976 } 977 // Set a service with check 978 task.Services = []*structs.Service{ 979 { 980 Name: "fakservice", 981 PortLabel: "http", 982 Checks: []*structs.ServiceCheck{ 983 { 984 Name: "fakecheck", 985 Type: structs.ServiceCheckScript, 986 Command: "true", 987 Interval: 30 * time.Second, 988 Timeout: 5 * time.Second, 989 }, 990 }, 991 }, 992 } 993 994 task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() 995 task2.Name = "task 2" 996 task2.Driver = "mock_driver" 997 task2.Config = map[string]interface{}{ 998 "start_error": "fail task please", 999 } 1000 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) 1001 alloc.AllocatedResources.Tasks[task.Name] = tr 1002 alloc.AllocatedResources.Tasks[task2.Name] = tr 1003 1004 // Make the alloc be part of a deployment 1005 alloc.DeploymentID = uuid.Generate() 1006 alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 1007 alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks 1008 alloc.Job.TaskGroups[0].Update.MaxParallel = 1 1009 alloc.Job.TaskGroups[0].Update.MinHealthyTime = 10 * time.Millisecond 1010 alloc.Job.TaskGroups[0].Update.HealthyDeadline = 2 * time.Second 1011 1012 checkHealthy := &api.AgentCheck{ 1013 CheckID: uuid.Generate(), 1014 Status: api.HealthPassing, 1015 } 1016 1017 conf, cleanup := testAllocRunnerConfig(t, alloc) 1018 defer cleanup() 1019 1020 consulClient := conf.Consul.(*cconsul.MockConsulServiceClient) 1021 consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) { 1022 return &consul.AllocRegistration{ 1023 Tasks: map[string]*consul.ServiceRegistrations{ 1024 task.Name: { 1025 Services: map[string]*consul.ServiceRegistration{ 1026 "123": { 1027 Service: &api.AgentService{Service: "fakeservice"}, 1028 Checks: []*api.AgentCheck{checkHealthy}, 1029 }, 1030 }, 1031 }, 1032 }, 1033 }, nil 1034 } 1035 1036 ar, err := NewAllocRunner(conf) 1037 require.NoError(t, err) 1038 defer destroy(ar) 1039 go ar.Run() 1040 upd := conf.StateUpdater.(*MockStateUpdater) 1041 1042 testutil.WaitForResult(func() (bool, error) { 1043 last := upd.Last() 1044 if last == nil { 1045 return false, fmt.Errorf("No updates") 1046 } 1047 if last.ClientStatus != structs.AllocClientStatusFailed { 1048 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed) 1049 } 1050 1051 // Task One should be killed 1052 state1 := last.TaskStates[task.Name] 1053 if state1.State != structs.TaskStateDead { 1054 return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) 1055 } 1056 if len(state1.Events) < 2 { 1057 // At least have a received and destroyed 1058 return false, fmt.Errorf("Unexpected number of events") 1059 } 1060 1061 found := false 1062 for _, e := range state1.Events { 1063 if e.Type != structs.TaskSiblingFailed { 1064 found = true 1065 } 1066 } 1067 1068 if !found { 1069 return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed) 1070 } 1071 1072 // Task Two should be failed 1073 state2 := last.TaskStates[task2.Name] 1074 if state2.State != structs.TaskStateDead { 1075 return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead) 1076 } 1077 if !state2.Failed { 1078 return false, fmt.Errorf("task2 should have failed") 1079 } 1080 1081 if !last.DeploymentStatus.HasHealth() { 1082 return false, fmt.Errorf("Expected deployment health to be non nil") 1083 } 1084 1085 return true, nil 1086 }, func(err error) { 1087 require.Fail(t, "err: %v", err) 1088 }) 1089 } 1090 1091 // Test that alloc becoming terminal should destroy the alloc runner 1092 func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) { 1093 t.Parallel() 1094 alloc := mock.BatchAlloc() 1095 tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] 1096 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 1097 alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 1098 // Ensure task takes some time 1099 task := alloc.Job.TaskGroups[0].Tasks[0] 1100 task.Driver = "mock_driver" 1101 task.Config["run_for"] = "10s" 1102 alloc.AllocatedResources.Tasks[task.Name] = tr 1103 1104 conf, cleanup := testAllocRunnerConfig(t, alloc) 1105 defer cleanup() 1106 ar, err := NewAllocRunner(conf) 1107 require.NoError(t, err) 1108 defer destroy(ar) 1109 go ar.Run() 1110 upd := conf.StateUpdater.(*MockStateUpdater) 1111 1112 testutil.WaitForResult(func() (bool, error) { 1113 last := upd.Last() 1114 if last == nil { 1115 return false, fmt.Errorf("No updates") 1116 } 1117 if last.ClientStatus != structs.AllocClientStatusRunning { 1118 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) 1119 } 1120 return true, nil 1121 }, func(err error) { 1122 require.Fail(t, "err: %v", err) 1123 }) 1124 1125 // Update the alloc to be terminal which should cause the alloc runner to 1126 // stop the tasks and wait for a destroy. 1127 update := ar.alloc.Copy() 1128 update.DesiredStatus = structs.AllocDesiredStatusStop 1129 ar.Update(update) 1130 1131 testutil.WaitForResult(func() (bool, error) { 1132 last := upd.Last() 1133 if last == nil { 1134 return false, fmt.Errorf("No updates") 1135 } 1136 1137 // Check the status has changed. 1138 if last.ClientStatus != structs.AllocClientStatusComplete { 1139 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1140 } 1141 1142 // Check the alloc directory still exists 1143 if _, err := os.Stat(ar.allocDir.AllocDir); err != nil { 1144 return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir) 1145 } 1146 1147 return true, nil 1148 }, func(err error) { 1149 require.Fail(t, "err: %v", err) 1150 }) 1151 1152 // Send the destroy signal and ensure the AllocRunner cleans up. 1153 ar.Destroy() 1154 1155 testutil.WaitForResult(func() (bool, error) { 1156 last := upd.Last() 1157 if last == nil { 1158 return false, fmt.Errorf("No updates") 1159 } 1160 1161 // Check the status has changed. 1162 if last.ClientStatus != structs.AllocClientStatusComplete { 1163 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1164 } 1165 1166 // Check the alloc directory was cleaned 1167 if _, err := os.Stat(ar.allocDir.AllocDir); err == nil { 1168 return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir) 1169 } else if !os.IsNotExist(err) { 1170 return false, fmt.Errorf("stat err: %v", err) 1171 } 1172 1173 return true, nil 1174 }, func(err error) { 1175 require.Fail(t, "err: %v", err) 1176 }) 1177 } 1178 1179 // TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore 1180 func TestAllocRunner_PersistState_Destroyed(t *testing.T) { 1181 t.Parallel() 1182 1183 alloc := mock.BatchAlloc() 1184 taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name 1185 1186 conf, cleanup := testAllocRunnerConfig(t, alloc) 1187 conf.StateDB = state.NewMemDB(conf.Logger) 1188 1189 defer cleanup() 1190 ar, err := NewAllocRunner(conf) 1191 require.NoError(t, err) 1192 defer destroy(ar) 1193 1194 go ar.Run() 1195 1196 select { 1197 case <-ar.WaitCh(): 1198 case <-time.After(10 * time.Second): 1199 require.Fail(t, "timed out waiting for alloc to complete") 1200 } 1201 1202 // test final persisted state upon completion 1203 require.NoError(t, ar.PersistState()) 1204 allocs, _, err := conf.StateDB.GetAllAllocations() 1205 require.NoError(t, err) 1206 require.Len(t, allocs, 1) 1207 require.Equal(t, alloc.ID, allocs[0].ID) 1208 _, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName) 1209 require.NoError(t, err) 1210 require.Equal(t, structs.TaskStateDead, ts.State) 1211 1212 // check that DB alloc is empty after destroying AR 1213 ar.Destroy() 1214 select { 1215 case <-ar.DestroyCh(): 1216 case <-time.After(10 * time.Second): 1217 require.Fail(t, "timedout waiting for destruction") 1218 } 1219 1220 allocs, _, err = conf.StateDB.GetAllAllocations() 1221 require.NoError(t, err) 1222 require.Empty(t, allocs) 1223 _, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName) 1224 require.NoError(t, err) 1225 require.Nil(t, ts) 1226 1227 // check that DB alloc is empty after persisting state of destroyed AR 1228 ar.PersistState() 1229 allocs, _, err = conf.StateDB.GetAllAllocations() 1230 require.NoError(t, err) 1231 require.Empty(t, allocs) 1232 _, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName) 1233 require.NoError(t, err) 1234 require.Nil(t, ts) 1235 }