github.com/manicqin/nomad@v0.9.5/client/allocrunner/alloc_runner_test.go

package allocrunner

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/client/allochealth"
	"github.com/hashicorp/nomad/client/allocwatcher"
	cconsul "github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// destroy does a blocking destroy on an alloc runner
func destroy(ar *allocRunner) {
	ar.Destroy()
	<-ar.DestroyCh()
}

// TestAllocRunner_AllocState_Initialized asserts that TaskStates returned by
// AllocState() are initialized even before the AllocRunner has run.
func TestAllocRunner_AllocState_Initialized(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	allocState := ar.AllocState()

	require.NotNil(t, allocState)
	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
}

// TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
// entire task group is killed.
func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for all tasks to be killed
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task1 should be killed because Task2 exited
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

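		// Scan the follower's events for the leader-dead notification and
		// capture the kill message asserted on below.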
		found := false
		killingMsg := ""
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
			if e.Type == structs.TaskKilling {
				killingMsg = e.DisplayMessage
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
		if killingMsg != expectedKillingMsg {
			return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create a group service
	tg := alloc.Job.TaskGroups[0]
	tg.Services = []*structs.Service{
		{
			Name: "shutdown_service",
		},
	}

	// Create two tasks in the group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	// Set a shutdown delay
	shutdownDelay := 1 * time.Second
	alloc.Job.TaskGroups[0].ShutdownDelay = &shutdownDelay

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 2 {
			return false, fmt.Errorf("Not enough task states (want: 2; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop alloc
	shutdownInit := time.Now()
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		fin := last.TaskStates["leader"].FinishedAt

		if fin.IsZero() {
			return false, nil
		}

		return true, nil
	}, func(err error) {
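		// On timeout, log the last observed task states so the failure is
		// easier to diagnose.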
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})

	// Get consul client operations
	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
	consulOpts := consulClient.GetOps()
	var groupRemoveOp cconsul.MockConsulOp
	for _, op := range consulOpts {
		// Grab the first deregistration request
		if op.Op == "remove" && op.Name == "group-web" {
			groupRemoveOp = op
			break
		}
	}

	// Ensure remove operation is close to shutdown initiation
	require.True(t, groupRemoveOp.OccurredAt.Sub(shutdownInit) < 100*time.Millisecond)

	last = upd.Last()
	minShutdown := shutdownInit.Add(task.ShutdownDelay)
	leaderFinished := last.TaskStates["leader"].FinishedAt
	followerFinished := last.TaskStates["follower1"].FinishedAt

	// Check that both tasks shut down after min possible shutdown time
	require.Greater(t, leaderFinished.UnixNano(), minShutdown.UnixNano())
	require.Greater(t, followerFinished.UnixNano(), minShutdown.UnixNano())

	// Check that there is at least shutdown_delay between the consul
	// remove operation and the task's finish time
	require.True(t, leaderFinished.Sub(groupRemoveOp.OccurredAt) > shutdownDelay)
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
// leader the leader is stopped before other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create 3 tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr
	alloc.AllocatedResources.Tasks[task3.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop alloc
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
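	// Push the stop update; the leader should be killed before the followers.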
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group with a leader that failed before restoring, the leader
// is not stopped as it does not exist.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create a leader and follower task in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a memory backed statedb
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	// Mimic Nomad exiting before the leader stopping is able to stop other tasks.
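	// Record a dead leader and a running follower so that the restored
	// runner created below starts from that state.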
	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	// Create a new AllocRunner to test RestoreState and Run
	ar2, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar2)

	if err := ar2.Restore(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		for task, state := range alloc.TaskStates {
			if state.State != structs.TaskStateDead {
				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.DestroyCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

func TestAllocRunner_Update_Semantics(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
		upd := a.CopySkipJob()
		upd.AllocModifyIndex++

		return upd
	}

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(err)

	upd1 := updatedAlloc(alloc)
	ar.Update(upd1)

	// Update was placed into a queue
	require.Len(ar.allocUpdatedCh, 1)

	upd2 := updatedAlloc(alloc)
	ar.Update(upd2)

	// Allocation was _replaced_

	require.Len(ar.allocUpdatedCh, 1)
	queuedAlloc := <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Requeueing older alloc is skipped
	ar.Update(upd2)
	ar.Update(upd1)

	queuedAlloc = <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Ignore after watch closed

	close(ar.waitCh)

	ar.Update(upd1)

	// Did not queue the update
	require.Len(ar.allocUpdatedCh, 0)
}

// TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is
// reported for services that got migrated; not just part of deployments.
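// The alloc deliberately has no DeploymentID, so health can only come from the
// group's migrate stanza.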
func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	// Ensure the alloc is *not* part of a deployment
	alloc.DeploymentID = ""

	// Shorten the default migration healthy time
	tg := alloc.Job.TaskGroups[0]
	tg.Migrate = structs.DefaultMigrateStrategy()
	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates

	task := tg.Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "30s",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
// watcher will mark the allocation as healthy based on task states alone.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Create a task that takes longer to become healthy
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
	task2 := alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment that uses task states for
	// health checks
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	start, done := time.Now(), time.Time{}
	go ar.Run()
	defer destroy(ar)

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}

		// Capture the done timestamp
		done = last.DeploymentStatus.Timestamp
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	if d := done.Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
// watcher will mark the allocation as unhealthy with failing checks.
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Set a service with check
	task.Services = []*structs.Service{
		{
			Name:      "fakservice",
			PortLabel: "http",
			Checks: []*structs.ServiceCheck{
				{
					Name:     "fakecheck",
					Type:     structs.ServiceCheckScript,
					Command:  "true",
					Interval: 30 * time.Second,
					Timeout:  5 * time.Second,
				},
			},
		},
	}

	// Make the alloc be part of a deployment
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Mock Consul to always return a check that is not passing, so the
	// healthy deadline is exceeded
	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.ServiceRegistrations{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "fakeservice"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	var lastUpdate *structs.Allocation
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		lastUpdate = upd.Last()
		if lastUpdate == nil {
			return false, fmt.Errorf("No updates")
		}
		if !lastUpdate.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *lastUpdate.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	require.Len(t, lastUpdate.TaskStates, 1)
	state := lastUpdate.TaskStates[task.Name]
	require.NotNil(t, state)
	require.NotEmpty(t, state.Events)
	last := state.Events[len(state.Events)-1]
	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
	require.Contains(t, last.Message, "by deadline")
}

// TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running
// alloc.
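// A memory-backed state DB is used so the test can also assert that persisted
// client state is removed on destroy.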
func TestAllocRunner_Destroy(t *testing.T) {
	t.Parallel()

	// Ensure task takes some time
	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config["run_for"] = "10s"

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a MemDB to assert alloc state gets cleaned up
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()

	// Wait for alloc to be running
	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		return state.ClientStatus == structs.AllocClientStatusRunning,
			fmt.Errorf("got client status %v; want running", state.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Assert state was stored
	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)

	// Now destroy
	ar.Destroy()

	select {
	case <-ar.DestroyCh():
		// Destroyed properly!
	case <-time.After(10 * time.Second):
		require.Fail(t, "timed out waiting for alloc to be destroyed")
	}

	// Assert alloc is dead
	state := ar.AllocState()
	require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus)

	// Assert the state was cleaned
	ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.Nil(t, ls)
	require.Nil(t, ts)

	// Assert the alloc directory was cleaned
	if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
		require.Fail(t, "alloc dir still exists: %v", ar.allocDir.AllocDir)
	} else if !os.IsNotExist(err) {
		require.Failf(t, "expected NotExist error", "found %v", err)
	}
}

func TestAllocRunner_SimpleRun(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	// Wait for alloc to be running
	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		if state.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
		}

		for t, s := range state.TaskStates {
			if s.FinishedAt.IsZero() {
				return false, fmt.Errorf("task %q has zero FinishedAt value", t)
			}
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

}

// TestAllocRunner_MoveAllocDir asserts that a rescheduled
// allocation copies ephemeral disk content from the previous alloc run
func TestAllocRunner_MoveAllocDir(t *testing.T) {
	t.Parallel()

	// Step 1: start and run a task
	alloc := mock.BatchAlloc()
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	ar.Run()
	defer destroy(ar)

	require.Equal(t, structs.AllocClientStatusComplete, ar.AllocState().ClientStatus)

	// Step 2. Modify its directory
	task := alloc.Job.TaskGroups[0].Tasks[0]
	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
	taskDir := ar.allocDir.TaskDirs[task.Name]
	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)

	// Step 3. Start a new alloc
	alloc2 := mock.BatchAlloc()
	alloc2.PreviousAllocation = alloc.ID
	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true

	conf2, cleanup := testAllocRunnerConfig(t, alloc2)
	conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{
		Alloc:          alloc2,
		PreviousRunner: ar,
		Logger:         conf2.Logger,
	})
	defer cleanup()
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	ar2.Run()
	defer destroy(ar2)

	require.Equal(t, structs.AllocClientStatusComplete, ar2.AllocState().ClientStatus)

	// Ensure that data from ar was moved to ar2
	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
	fileInfo, _ := os.Stat(dataFile)
	require.NotNilf(t, fileInfo, "file %q not found", dataFile)

	taskDir = ar2.allocDir.TaskDirs[task.Name]
	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
	fileInfo, _ = os.Stat(taskLocalFile)
	require.NotNilf(t, fileInfo, "file %q not found", taskLocalFile)

}

// TestAllocRunner_HandlesArtifactFailure ensures that if one task in a task
// group keeps retrying an artifact fetch, the other tasks in the group are
// still able to proceed.
func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
		Mode:     structs.RestartPolicyModeFail,
		Attempts: 1,
		Delay:    time.Nanosecond,
		Interval: time.Hour,
	}

	// Create a new task with a bad artifact
	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	badtask.Name = "bad"
	badtask.Artifacts = []*structs.TaskArtifact{
		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
	alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{
		Cpu: structs.AllocatedCpuResources{
			CpuShares: 500,
		},
		Memory: structs.AllocatedMemoryResources{
			MemoryMB: 256,
		},
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		state := ar.AllocState()

		switch state.ClientStatus {
		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
			return true, nil
		default:
			return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus)
		}

	}, func(err error) {
		require.NoError(t, err)
	})

	state := ar.AllocState()
	require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus)
	require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State)
	require.True(t, state.TaskStates["web"].Successful())
	require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State)
	require.True(t, state.TaskStates["bad"].Failed)
}

// Test that alloc runner kills tasks in task group when another task fails
func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}
	// Set a service with check
	task.Services = []*structs.Service{
		{
			Name:      "fakservice",
			PortLabel: "http",
			Checks: []*structs.ServiceCheck{
				{
					Name:     "fakecheck",
					Type:     structs.ServiceCheckScript,
					Command:  "true",
					Interval: 30 * time.Second,
					Timeout:  5 * time.Second,
				},
			},
		},
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Config = map[string]interface{}{
		"start_error": "fail task please",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	// Make the alloc be part of a deployment
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 10 * time.Millisecond
	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 2 * time.Second

	checkHealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthPassing,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.ServiceRegistrations{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "fakeservice"},
							Checks:  []*api.AgentCheck{checkHealthy},
						},
					},
				},
			},
		}, nil
	}

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()
	upd := conf.StateUpdater.(*MockStateUpdater)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskSiblingFailed {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
		}

		// Task Two should be failed
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if !state2.Failed {
			return false, fmt.Errorf("task2 should have failed")
		}

		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("Expected deployment health to be non nil")
		}

		return true, nil
	}, func(err error) {
		require.Fail(t, "err: %v", err)
	})
}

// Test that alloc becoming terminal should destroy the alloc runner
func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
	t.Parallel()
	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	// Ensure task takes some time
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	alloc.AllocatedResources.Tasks[task.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()
	upd := conf.StateUpdater.(*MockStateUpdater)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		require.Fail(t, "err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		require.Fail(t, "err: %v", err)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		require.Fail(t, "err: %v", err)
	})
}

// TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore
func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	conf.StateDB = state.NewMemDB(conf.Logger)

	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)

	go ar.Run()

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "timed out waiting for alloc to complete")
	}

	// test final persisted state upon completion
	require.NoError(t, ar.PersistState())
	allocs, _, err := conf.StateDB.GetAllAllocations()
	require.NoError(t, err)
	require.Len(t, allocs, 1)
	require.Equal(t, alloc.ID, allocs[0].ID)
	_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
	require.NoError(t, err)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// check that DB alloc is empty after destroying AR
	ar.Destroy()
	select {
	case <-ar.DestroyCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "timed out waiting for destruction")
	}

	allocs, _, err = conf.StateDB.GetAllAllocations()
	require.NoError(t, err)
	require.Empty(t, allocs)
	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
	require.NoError(t, err)
	require.Nil(t, ts)

	// check that DB alloc is empty after persisting state of destroyed AR
	ar.PersistState()
	allocs, _, err = conf.StateDB.GetAllAllocations()
	require.NoError(t, err)
	require.Empty(t, allocs)
	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
	require.NoError(t, err)
	require.Nil(t, ts)
}