github.com/djenriquez/nomad-1@v0.8.1/client/alloc_runner_test.go (about) 1 package client 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "os" 7 "path/filepath" 8 "strings" 9 "sync" 10 "testing" 11 "text/template" 12 "time" 13 14 "github.com/boltdb/bolt" 15 "github.com/hashicorp/consul/api" 16 "github.com/hashicorp/go-multierror" 17 "github.com/hashicorp/nomad/command/agent/consul" 18 "github.com/hashicorp/nomad/helper/testlog" 19 "github.com/hashicorp/nomad/helper/uuid" 20 "github.com/hashicorp/nomad/nomad/mock" 21 "github.com/hashicorp/nomad/nomad/structs" 22 "github.com/hashicorp/nomad/testutil" 23 "github.com/hashicorp/nomad/version" 24 "github.com/kr/pretty" 25 "github.com/stretchr/testify/assert" 26 27 "github.com/hashicorp/nomad/client/config" 28 "github.com/hashicorp/nomad/client/vaultclient" 29 "github.com/stretchr/testify/require" 30 ) 31 32 type MockAllocStateUpdater struct { 33 Allocs []*structs.Allocation 34 mu sync.Mutex 35 } 36 37 // Update fulfills the TaskStateUpdater interface 38 func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) { 39 m.mu.Lock() 40 m.Allocs = append(m.Allocs, alloc) 41 m.mu.Unlock() 42 } 43 44 // Last returns a copy of the last alloc (or nil) sync'd 45 func (m *MockAllocStateUpdater) Last() *structs.Allocation { 46 m.mu.Lock() 47 defer m.mu.Unlock() 48 n := len(m.Allocs) 49 if n == 0 { 50 return nil 51 } 52 return m.Allocs[n-1].Copy() 53 } 54 55 // allocationBucketExists checks if the allocation bucket was created. 56 func allocationBucketExists(tx *bolt.Tx, allocID string) bool { 57 allocations := tx.Bucket(allocationsBucket) 58 if allocations == nil { 59 return false 60 } 61 62 // Retrieve the specific allocations bucket 63 alloc := allocations.Bucket([]byte(allocID)) 64 return alloc != nil 65 } 66 67 func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) { 68 conf := config.DefaultConfig() 69 conf.Node = mock.Node() 70 conf.StateDir = os.TempDir() 71 conf.AllocDir = os.TempDir() 72 tmp, _ := ioutil.TempFile("", "state-db") 73 db, _ := bolt.Open(tmp.Name(), 0600, nil) 74 upd := &MockAllocStateUpdater{} 75 if !restarts { 76 *alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0} 77 alloc.Job.Type = structs.JobTypeBatch 78 } 79 vclient := vaultclient.NewMockVaultClient() 80 ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{}) 81 return upd, ar 82 } 83 84 func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) { 85 // Use mock driver 86 alloc := mock.Alloc() 87 task := alloc.Job.TaskGroups[0].Tasks[0] 88 task.Driver = "mock_driver" 89 task.Config["run_for"] = "500ms" 90 return testAllocRunnerFromAlloc(t, alloc, restarts) 91 } 92 93 func TestAllocRunner_SimpleRun(t *testing.T) { 94 t.Parallel() 95 upd, ar := testAllocRunner(t, false) 96 go ar.Run() 97 defer ar.Destroy() 98 99 testutil.WaitForResult(func() (bool, error) { 100 last := upd.Last() 101 if last == nil { 102 return false, fmt.Errorf("No updates") 103 } 104 if last.ClientStatus != structs.AllocClientStatusComplete { 105 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 106 } 107 return true, nil 108 }, func(err error) { 109 t.Fatalf("err: %v", err) 110 }) 111 } 112 113 // Test that FinisheAt is set when the alloc is in a terminal state 114 func TestAllocRunner_FinishedAtSet(t *testing.T) { 115 t.Parallel() 116 require := require.New(t) 117 _, ar := testAllocRunner(t, false) 118 ar.allocClientStatus = structs.AllocClientStatusFailed 119 alloc := ar.Alloc() 120 taskFinishedAt := make(map[string]time.Time) 121 require.NotEmpty(alloc.TaskStates) 122 for name, s := range alloc.TaskStates { 123 require.False(s.FinishedAt.IsZero()) 124 taskFinishedAt[name] = s.FinishedAt 125 } 126 127 // Verify that calling again should not mutate finishedAt 128 alloc2 := ar.Alloc() 129 for name, s := range alloc2.TaskStates { 130 require.Equal(taskFinishedAt[name], s.FinishedAt) 131 } 132 133 } 134 135 // Test that FinisheAt is set when the alloc is in a terminal state 136 func TestAllocRunner_FinishedAtSet_TaskEvents(t *testing.T) { 137 t.Parallel() 138 require := require.New(t) 139 _, ar := testAllocRunner(t, false) 140 ar.taskStates[ar.alloc.Job.TaskGroups[0].Tasks[0].Name] = &structs.TaskState{State: structs.TaskStateDead, Failed: true} 141 142 alloc := ar.Alloc() 143 taskFinishedAt := make(map[string]time.Time) 144 require.NotEmpty(alloc.TaskStates) 145 for name, s := range alloc.TaskStates { 146 require.False(s.FinishedAt.IsZero()) 147 taskFinishedAt[name] = s.FinishedAt 148 } 149 150 // Verify that calling again should not mutate finishedAt 151 alloc2 := ar.Alloc() 152 for name, s := range alloc2.TaskStates { 153 require.Equal(taskFinishedAt[name], s.FinishedAt) 154 } 155 156 } 157 158 // Test that the watcher will mark the allocation as unhealthy. 159 func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) { 160 t.Parallel() 161 assert := assert.New(t) 162 163 // Ensure the task fails and restarts 164 upd, ar := testAllocRunner(t, true) 165 166 // Make the task fail 167 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 168 task.Driver = "mock_driver" 169 task.Config["start_error"] = "test error" 170 171 // Make the alloc be part of a deployment 172 ar.alloc.DeploymentID = uuid.Generate() 173 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 174 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates 175 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 176 177 go ar.Run() 178 defer ar.Destroy() 179 180 testutil.WaitForResult(func() (bool, error) { 181 last := upd.Last() 182 if last == nil { 183 return false, fmt.Errorf("No updates") 184 } 185 if !last.DeploymentStatus.HasHealth() { 186 return false, fmt.Errorf("want deployment status unhealthy; got unset") 187 } else if *last.DeploymentStatus.Healthy { 188 return false, fmt.Errorf("want deployment status unhealthy; got healthy") 189 } 190 return true, nil 191 }, func(err error) { 192 t.Fatalf("err: %v", err) 193 }) 194 195 // Assert that we have an event explaining why we are unhealthy. 196 assert.Len(ar.taskStates, 1) 197 state := ar.taskStates[task.Name] 198 assert.NotNil(state) 199 assert.NotEmpty(state.Events) 200 last := state.Events[len(state.Events)-1] 201 assert.Equal(allocHealthEventSource, last.Type) 202 assert.Contains(last.Message, "failed task") 203 } 204 205 // Test that the watcher will mark the allocation as unhealthy if it hits its 206 // deadline. 207 func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) { 208 t.Parallel() 209 assert := assert.New(t) 210 211 // Ensure the task fails and restarts 212 upd, ar := testAllocRunner(t, true) 213 214 // Make the task block 215 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 216 task.Driver = "mock_driver" 217 task.Config["start_block_for"] = "4s" 218 task.Config["run_for"] = "10s" 219 220 // Make the alloc be part of a deployment 221 ar.alloc.DeploymentID = uuid.Generate() 222 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 223 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates 224 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 225 ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond 226 227 go ar.Run() 228 defer ar.Destroy() 229 230 testutil.WaitForResult(func() (bool, error) { 231 last := upd.Last() 232 if last == nil { 233 return false, fmt.Errorf("No updates") 234 } 235 if !last.DeploymentStatus.HasHealth() { 236 return false, fmt.Errorf("want deployment status unhealthy; got unset") 237 } else if *last.DeploymentStatus.Healthy { 238 return false, fmt.Errorf("want deployment status unhealthy; got healthy") 239 } 240 return true, nil 241 }, func(err error) { 242 t.Fatalf("err: %v", err) 243 }) 244 245 // Assert that we have an event explaining why we are unhealthy. 246 assert.Len(ar.taskStates, 1) 247 state := ar.taskStates[task.Name] 248 assert.NotNil(state) 249 assert.NotEmpty(state.Events) 250 last := state.Events[len(state.Events)-1] 251 assert.Equal(allocHealthEventSource, last.Type) 252 assert.Contains(last.Message, "not running by deadline") 253 } 254 255 // Test that the watcher will mark the allocation as healthy. 256 func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) { 257 t.Parallel() 258 259 // Ensure the task fails and restarts 260 upd, ar := testAllocRunner(t, true) 261 262 // Make the task run healthy 263 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 264 task.Driver = "mock_driver" 265 task.Config["run_for"] = "10s" 266 267 // Create a task that takes longer to become healthy 268 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy()) 269 task2 := ar.alloc.Job.TaskGroups[0].Tasks[1] 270 task2.Name = "task 2" 271 task2.Config["start_block_for"] = "500ms" 272 273 // Make the alloc be part of a deployment 274 ar.alloc.DeploymentID = uuid.Generate() 275 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 276 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates 277 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 278 ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 279 280 start := time.Now() 281 go ar.Run() 282 defer ar.Destroy() 283 284 testutil.WaitForResult(func() (bool, error) { 285 last := upd.Last() 286 if last == nil { 287 return false, fmt.Errorf("No updates") 288 } 289 if !last.DeploymentStatus.HasHealth() { 290 return false, fmt.Errorf("want deployment status unhealthy; got unset") 291 } else if !*last.DeploymentStatus.Healthy { 292 return false, fmt.Errorf("want deployment status healthy; got unhealthy") 293 } 294 return true, nil 295 }, func(err error) { 296 t.Fatalf("err: %v", err) 297 }) 298 if d := time.Now().Sub(start); d < 500*time.Millisecond { 299 t.Fatalf("didn't wait for second task group. Only took %v", d) 300 } 301 } 302 303 // Test that the watcher will mark the allocation as healthy with checks 304 func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) { 305 t.Parallel() 306 307 // Ensure the task fails and restarts 308 upd, ar := testAllocRunner(t, true) 309 310 // Make the task fail 311 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 312 task.Driver = "mock_driver" 313 task.Config["run_for"] = "10s" 314 315 // Create a task that has no checks 316 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy()) 317 task2 := ar.alloc.Job.TaskGroups[0].Tasks[1] 318 task2.Name = "task 2" 319 task2.Services = nil 320 321 // Make the alloc be part of a deployment 322 ar.alloc.DeploymentID = uuid.Generate() 323 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 324 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks 325 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 326 ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 327 328 checkHealthy := &api.AgentCheck{ 329 CheckID: uuid.Generate(), 330 Status: api.HealthPassing, 331 } 332 checkUnhealthy := &api.AgentCheck{ 333 CheckID: checkHealthy.CheckID, 334 Status: api.HealthWarning, 335 } 336 337 // Only return the check as healthy after a duration 338 trigger := time.After(500 * time.Millisecond) 339 ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) { 340 select { 341 case <-trigger: 342 return &consul.AllocRegistration{ 343 Tasks: map[string]*consul.TaskRegistration{ 344 task.Name: { 345 Services: map[string]*consul.ServiceRegistration{ 346 "123": { 347 Service: &api.AgentService{Service: "foo"}, 348 Checks: []*api.AgentCheck{checkHealthy}, 349 }, 350 }, 351 }, 352 }, 353 }, nil 354 default: 355 return &consul.AllocRegistration{ 356 Tasks: map[string]*consul.TaskRegistration{ 357 task.Name: { 358 Services: map[string]*consul.ServiceRegistration{ 359 "123": { 360 Service: &api.AgentService{Service: "foo"}, 361 Checks: []*api.AgentCheck{checkUnhealthy}, 362 }, 363 }, 364 }, 365 }, 366 }, nil 367 } 368 } 369 370 start := time.Now() 371 go ar.Run() 372 defer ar.Destroy() 373 374 testutil.WaitForResult(func() (bool, error) { 375 last := upd.Last() 376 if last == nil { 377 return false, fmt.Errorf("No updates") 378 } 379 if !last.DeploymentStatus.HasHealth() { 380 return false, fmt.Errorf("want deployment status unhealthy; got unset") 381 } else if !*last.DeploymentStatus.Healthy { 382 return false, fmt.Errorf("want deployment status healthy; got unhealthy") 383 } 384 return true, nil 385 }, func(err error) { 386 t.Fatalf("err: %v", err) 387 }) 388 389 if d := time.Now().Sub(start); d < 500*time.Millisecond { 390 t.Fatalf("didn't wait for second task group. Only took %v", d) 391 } 392 } 393 394 // Test that the watcher will mark the allocation as unhealthy with failing 395 // checks 396 func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) { 397 t.Parallel() 398 assert := assert.New(t) 399 400 // Ensure the task fails and restarts 401 upd, ar := testAllocRunner(t, true) 402 403 // Make the task fail 404 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 405 task.Driver = "mock_driver" 406 task.Config["run_for"] = "10s" 407 408 // Make the alloc be part of a deployment 409 ar.alloc.DeploymentID = uuid.Generate() 410 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 411 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks 412 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 413 ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 414 ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second 415 416 checkUnhealthy := &api.AgentCheck{ 417 CheckID: uuid.Generate(), 418 Status: api.HealthWarning, 419 } 420 421 // Only return the check as healthy after a duration 422 ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) { 423 return &consul.AllocRegistration{ 424 Tasks: map[string]*consul.TaskRegistration{ 425 task.Name: { 426 Services: map[string]*consul.ServiceRegistration{ 427 "123": { 428 Service: &api.AgentService{Service: "foo"}, 429 Checks: []*api.AgentCheck{checkUnhealthy}, 430 }, 431 }, 432 }, 433 }, 434 }, nil 435 } 436 437 go ar.Run() 438 defer ar.Destroy() 439 440 testutil.WaitForResult(func() (bool, error) { 441 last := upd.Last() 442 if last == nil { 443 return false, fmt.Errorf("No updates") 444 } 445 if !last.DeploymentStatus.HasHealth() { 446 return false, fmt.Errorf("want deployment status unhealthy; got unset") 447 } else if *last.DeploymentStatus.Healthy { 448 return false, fmt.Errorf("want deployment status unhealthy; got healthy") 449 } 450 return true, nil 451 }, func(err error) { 452 t.Fatalf("err: %v", err) 453 }) 454 455 // Assert that we have an event explaining why we are unhealthy. 456 assert.Len(ar.taskStates, 1) 457 state := ar.taskStates[task.Name] 458 assert.NotNil(state) 459 assert.NotEmpty(state.Events) 460 last := state.Events[len(state.Events)-1] 461 assert.Equal(allocHealthEventSource, last.Type) 462 assert.Contains(last.Message, "Services not healthy by deadline") 463 } 464 465 // Test that the watcher will mark the allocation as healthy. 466 func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) { 467 t.Parallel() 468 469 // Ensure the task fails and restarts 470 upd, ar := testAllocRunner(t, true) 471 472 // Make the task run healthy 473 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 474 task.Driver = "mock_driver" 475 task.Config["run_for"] = "30s" 476 477 // Make the alloc be part of a deployment 478 ar.alloc.DeploymentID = uuid.Generate() 479 ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() 480 ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates 481 ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1 482 ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond 483 484 go ar.Run() 485 defer ar.Destroy() 486 487 testutil.WaitForResult(func() (bool, error) { 488 last := upd.Last() 489 if last == nil { 490 return false, fmt.Errorf("No updates") 491 } 492 if !last.DeploymentStatus.HasHealth() { 493 return false, fmt.Errorf("want deployment status unhealthy; got unset") 494 } else if !*last.DeploymentStatus.Healthy { 495 return false, fmt.Errorf("want deployment status healthy; got unhealthy") 496 } 497 return true, nil 498 }, func(err error) { 499 t.Fatalf("err: %v", err) 500 }) 501 502 // Mimick an update to a new deployment id 503 last := upd.Last() 504 last.DeploymentStatus = nil 505 last.DeploymentID = uuid.Generate() 506 ar.Update(last) 507 508 testutil.WaitForResult(func() (bool, error) { 509 last := upd.Last() 510 if !last.DeploymentStatus.HasHealth() { 511 return false, fmt.Errorf("want deployment status unhealthy; got unset") 512 } else if !*last.DeploymentStatus.Healthy { 513 return false, fmt.Errorf("want deployment status healthy; got unhealthy") 514 } 515 return true, nil 516 }, func(err error) { 517 t.Fatalf("err: %v", err) 518 }) 519 } 520 521 // Test that health is reported for services that got migrated; not just part 522 // of deployments. 523 func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) { 524 t.Parallel() 525 526 // Ensure the task fails and restarts 527 upd, ar := testAllocRunner(t, true) 528 529 // Make the task run healthy 530 tg := ar.alloc.Job.TaskGroups[0] 531 task := tg.Tasks[0] 532 task.Driver = "mock_driver" 533 task.Config["run_for"] = "30s" 534 535 // Shorten the default migration healthy time 536 tg.Migrate = structs.DefaultMigrateStrategy() 537 tg.Migrate.MinHealthyTime = 100 * time.Millisecond 538 tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates 539 540 // Ensure the alloc is *not* part of a deployment 541 ar.alloc.DeploymentID = "" 542 543 go ar.Run() 544 defer ar.Destroy() 545 546 testutil.WaitForResult(func() (bool, error) { 547 last := upd.Last() 548 if last == nil { 549 return false, fmt.Errorf("No updates") 550 } 551 if !last.DeploymentStatus.HasHealth() { 552 return false, fmt.Errorf("want deployment status unhealthy; got unset") 553 } else if !*last.DeploymentStatus.Healthy { 554 return false, fmt.Errorf("want deployment status healthy; got unhealthy") 555 } 556 return true, nil 557 }, func(err error) { 558 t.Fatalf("err: %v", err) 559 }) 560 } 561 562 // Test that health is *not* reported for batch jobs 563 func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) { 564 t.Parallel() 565 566 // Ensure the task fails and restarts 567 alloc := mock.BatchAlloc() 568 tg := alloc.Job.TaskGroups[0] 569 570 // This should not be possile as validation should prevent batch jobs 571 // from having a migration stanza! 572 tg.Migrate = structs.DefaultMigrateStrategy() 573 tg.Migrate.MinHealthyTime = 1 * time.Millisecond 574 tg.Migrate.HealthyDeadline = 2 * time.Millisecond 575 tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates 576 577 task := tg.Tasks[0] 578 task.Driver = "mock_driver" 579 task.Config["run_for"] = "5s" 580 upd, ar := testAllocRunnerFromAlloc(t, alloc, false) 581 582 go ar.Run() 583 defer ar.Destroy() 584 585 testutil.WaitForResult(func() (bool, error) { 586 last := upd.Last() 587 if last == nil { 588 return false, fmt.Errorf("No updates") 589 } 590 if last.DeploymentStatus != nil { 591 return false, fmt.Errorf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy) 592 } 593 return true, nil 594 }, func(err error) { 595 t.Fatalf("err: %v", err) 596 }) 597 } 598 599 // TestAllocRuner_RetryArtifact ensures that if one task in a task group is 600 // retrying fetching an artifact, other tasks in the group should be able 601 // to proceed. 602 func TestAllocRunner_RetryArtifact(t *testing.T) { 603 t.Parallel() 604 605 alloc := mock.Alloc() 606 alloc.Job.Type = structs.JobTypeBatch 607 alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail 608 alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1 609 alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second 610 611 task := alloc.Job.TaskGroups[0].Tasks[0] 612 task.Driver = "mock_driver" 613 task.Config = map[string]interface{}{ 614 "exit_code": "0", 615 "run_for": "1s", 616 } 617 618 // Create a new task with a bad artifact 619 badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy() 620 badtask.Name = "bad" 621 badtask.Artifacts = []*structs.TaskArtifact{ 622 {GetterSource: "http://127.0.0.1:0/foo/bar/baz"}, 623 } 624 625 alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask) 626 upd, ar := testAllocRunnerFromAlloc(t, alloc, true) 627 go ar.Run() 628 defer ar.Destroy() 629 630 testutil.WaitForResult(func() (bool, error) { 631 last := upd.Last() 632 if last == nil { 633 return false, fmt.Errorf("No updates") 634 } 635 636 // web task should have completed successfully while bad task 637 // retries artifact fetching 638 webstate, ok := last.TaskStates["web"] 639 if !ok { 640 return false, fmt.Errorf("no task state for web") 641 } 642 if webstate.State != structs.TaskStateDead { 643 return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State) 644 } 645 if !webstate.Successful() { 646 return false, fmt.Errorf("expected web to have exited successfully") 647 } 648 649 // bad task should have failed 650 badstate := last.TaskStates["bad"] 651 if badstate.State != structs.TaskStateDead { 652 return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State) 653 } 654 if !badstate.Failed { 655 return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events) 656 } 657 return true, nil 658 }, func(err error) { 659 t.Fatalf("err: %v", err) 660 }) 661 } 662 663 func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) { 664 t.Parallel() 665 upd, ar := testAllocRunner(t, false) 666 667 // Ensure task takes some time 668 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 669 task.Driver = "mock_driver" 670 task.Config["run_for"] = "10s" 671 go ar.Run() 672 673 testutil.WaitForResult(func() (bool, error) { 674 last := upd.Last() 675 if last == nil { 676 return false, fmt.Errorf("No updates") 677 } 678 if last.ClientStatus != structs.AllocClientStatusRunning { 679 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) 680 } 681 return true, nil 682 }, func(err error) { 683 t.Fatalf("err: %v", err) 684 }) 685 686 // Update the alloc to be terminal which should cause the alloc runner to 687 // stop the tasks and wait for a destroy. 688 update := ar.alloc.Copy() 689 update.DesiredStatus = structs.AllocDesiredStatusStop 690 ar.Update(update) 691 692 testutil.WaitForResult(func() (bool, error) { 693 last := upd.Last() 694 if last == nil { 695 return false, fmt.Errorf("No updates") 696 } 697 698 // Check the status has changed. 699 if last.ClientStatus != structs.AllocClientStatusComplete { 700 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 701 } 702 703 // Check the allocation state still exists 704 if err := ar.stateDB.View(func(tx *bolt.Tx) error { 705 if !allocationBucketExists(tx, ar.Alloc().ID) { 706 return fmt.Errorf("no bucket for alloc") 707 } 708 709 return nil 710 }); err != nil { 711 return false, fmt.Errorf("state destroyed") 712 } 713 714 // Check the alloc directory still exists 715 if _, err := os.Stat(ar.allocDir.AllocDir); err != nil { 716 return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir) 717 } 718 719 return true, nil 720 }, func(err error) { 721 t.Fatalf("err: %v", err) 722 }) 723 724 // Send the destroy signal and ensure the AllocRunner cleans up. 725 ar.Destroy() 726 727 testutil.WaitForResult(func() (bool, error) { 728 last := upd.Last() 729 if last == nil { 730 return false, fmt.Errorf("No updates") 731 } 732 733 // Check the status has changed. 734 if last.ClientStatus != structs.AllocClientStatusComplete { 735 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 736 } 737 738 // Check the state was cleaned 739 if err := ar.stateDB.View(func(tx *bolt.Tx) error { 740 if allocationBucketExists(tx, ar.Alloc().ID) { 741 return fmt.Errorf("bucket for alloc exists") 742 } 743 744 return nil 745 }); err != nil { 746 return false, fmt.Errorf("state not destroyed") 747 } 748 749 // Check the alloc directory was cleaned 750 if _, err := os.Stat(ar.allocDir.AllocDir); err == nil { 751 return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir) 752 } else if !os.IsNotExist(err) { 753 return false, fmt.Errorf("stat err: %v", err) 754 } 755 756 return true, nil 757 }, func(err error) { 758 t.Fatalf("err: %v", err) 759 }) 760 } 761 762 func TestAllocRunner_Destroy(t *testing.T) { 763 t.Parallel() 764 upd, ar := testAllocRunner(t, false) 765 766 // Ensure task takes some time 767 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 768 task.Driver = "mock_driver" 769 task.Config["run_for"] = "10s" 770 go ar.Run() 771 start := time.Now() 772 773 // Begin the tear down 774 go func() { 775 time.Sleep(1 * time.Second) 776 ar.Destroy() 777 }() 778 779 testutil.WaitForResult(func() (bool, error) { 780 last := upd.Last() 781 if last == nil { 782 return false, fmt.Errorf("No updates") 783 } 784 785 // Check the status has changed. 786 if last.ClientStatus != structs.AllocClientStatusComplete { 787 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 788 } 789 790 // Check the state was cleaned 791 if err := ar.stateDB.View(func(tx *bolt.Tx) error { 792 if allocationBucketExists(tx, ar.Alloc().ID) { 793 return fmt.Errorf("bucket for alloc exists") 794 } 795 796 return nil 797 }); err != nil { 798 return false, fmt.Errorf("state not destroyed: %v", err) 799 } 800 801 // Check the alloc directory was cleaned 802 if _, err := os.Stat(ar.allocDir.AllocDir); err == nil { 803 return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir) 804 } else if !os.IsNotExist(err) { 805 return false, fmt.Errorf("stat err: %v", err) 806 } 807 808 return true, nil 809 }, func(err error) { 810 t.Fatalf("err: %v", err) 811 }) 812 813 if elapsed := time.Since(start); elapsed > 20*time.Second { 814 t.Fatalf("took too long to terminate: %s", elapsed) 815 } 816 } 817 818 func TestAllocRunner_Update(t *testing.T) { 819 t.Parallel() 820 _, ar := testAllocRunner(t, false) 821 822 // Deep copy the alloc to avoid races when updating 823 newAlloc := ar.Alloc().Copy() 824 825 // Ensure task takes some time 826 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 827 task.Driver = "mock_driver" 828 task.Config["run_for"] = "10s" 829 go ar.Run() 830 defer ar.Destroy() 831 832 // Update the alloc definition 833 newAlloc.Name = "FOO" 834 newAlloc.AllocModifyIndex++ 835 ar.Update(newAlloc) 836 837 // Check the alloc runner stores the update allocation. 838 testutil.WaitForResult(func() (bool, error) { 839 return ar.Alloc().Name == "FOO", nil 840 }, func(err error) { 841 t.Fatalf("err: %v %#v", err, ar.Alloc()) 842 }) 843 } 844 845 func TestAllocRunner_SaveRestoreState(t *testing.T) { 846 t.Parallel() 847 alloc := mock.Alloc() 848 task := alloc.Job.TaskGroups[0].Tasks[0] 849 task.Driver = "mock_driver" 850 task.Config = map[string]interface{}{ 851 "exit_code": "0", 852 "run_for": "10s", 853 } 854 855 upd, ar := testAllocRunnerFromAlloc(t, alloc, false) 856 go ar.Run() 857 defer ar.Destroy() 858 859 // Snapshot state 860 testutil.WaitForResult(func() (bool, error) { 861 ar.taskLock.RLock() 862 defer ar.taskLock.RUnlock() 863 return len(ar.tasks) == 1, nil 864 }, func(err error) { 865 t.Fatalf("task never started: %v", err) 866 }) 867 868 err := ar.SaveState() 869 if err != nil { 870 t.Fatalf("err: %v", err) 871 } 872 873 // Create a new alloc runner 874 l2 := prefixedTestLogger("----- ar2: ") 875 alloc2 := &structs.Allocation{ID: ar.alloc.ID} 876 prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "") 877 ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update, 878 alloc2, ar.vaultClient, ar.consulClient, prevAlloc) 879 err = ar2.RestoreState() 880 if err != nil { 881 t.Fatalf("err: %v", err) 882 } 883 go ar2.Run() 884 885 testutil.WaitForResult(func() (bool, error) { 886 if len(ar2.tasks) != 1 { 887 return false, fmt.Errorf("Incorrect number of tasks") 888 } 889 890 last := upd.Last() 891 if last == nil { 892 return false, nil 893 } 894 895 return last.ClientStatus == structs.AllocClientStatusRunning, nil 896 }, func(err error) { 897 last := upd.Last() 898 t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"]) 899 }) 900 901 // Destroy and wait 902 ar2.Destroy() 903 start := time.Now() 904 905 testutil.WaitForResult(func() (bool, error) { 906 alloc := ar2.Alloc() 907 if alloc.ClientStatus != structs.AllocClientStatusComplete { 908 return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete) 909 } 910 return true, nil 911 }, func(err error) { 912 last := upd.Last() 913 t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates) 914 }) 915 916 if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second { 917 t.Fatalf("took too long to terminate") 918 } 919 } 920 921 func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) { 922 t.Parallel() 923 upd, ar := testAllocRunner(t, false) 924 ar.logger = prefixedTestLogger("ar1: ") 925 926 // Ensure task takes some time 927 ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver" 928 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 929 task.Config["run_for"] = "10s" 930 go ar.Run() 931 defer ar.Destroy() 932 933 testutil.WaitForResult(func() (bool, error) { 934 last := upd.Last() 935 if last == nil { 936 return false, fmt.Errorf("No updates") 937 } 938 939 if last.ClientStatus != structs.AllocClientStatusRunning { 940 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) 941 } 942 return true, nil 943 }, func(err error) { 944 t.Fatalf("err: %v", err) 945 }) 946 947 // Update the alloc to be terminal which should cause the alloc runner to 948 // stop the tasks and wait for a destroy. 949 update := ar.alloc.Copy() 950 update.DesiredStatus = structs.AllocDesiredStatusStop 951 ar.Update(update) 952 953 testutil.WaitForResult(func() (bool, error) { 954 return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil 955 }, func(err error) { 956 t.Fatalf("err: %v", err) 957 }) 958 959 err := ar.SaveState() 960 if err != nil { 961 t.Fatalf("err: %v", err) 962 } 963 964 // Ensure ar1 doesn't recreate the state file 965 ar.allocLock.Lock() 966 defer ar.allocLock.Unlock() 967 968 // Create a new alloc runner 969 l2 := prefixedTestLogger("ar2: ") 970 alloc2 := &structs.Allocation{ID: ar.alloc.ID} 971 prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "") 972 ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update, 973 alloc2, ar.vaultClient, ar.consulClient, prevAlloc) 974 err = ar2.RestoreState() 975 if err != nil { 976 t.Fatalf("err: %v", err) 977 } 978 ar2.logger.Println("[TESTING] running second alloc runner") 979 go ar2.Run() 980 defer ar2.Destroy() // Just-in-case of failure before Destroy below 981 982 testutil.WaitForResult(func() (bool, error) { 983 // Check the state still exists 984 if err := ar.stateDB.View(func(tx *bolt.Tx) error { 985 if !allocationBucketExists(tx, ar2.Alloc().ID) { 986 return fmt.Errorf("no bucket for alloc") 987 } 988 989 return nil 990 }); err != nil { 991 return false, fmt.Errorf("state destroyed") 992 } 993 994 // Check the alloc directory still exists 995 if _, err := os.Stat(ar.allocDir.AllocDir); err != nil { 996 return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir) 997 } 998 999 return true, nil 1000 }, func(err error) { 1001 last := upd.Last() 1002 t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates) 1003 }) 1004 1005 // Send the destroy signal and ensure the AllocRunner cleans up. 1006 ar2.logger.Println("[TESTING] destroying second alloc runner") 1007 ar2.Destroy() 1008 1009 testutil.WaitForResult(func() (bool, error) { 1010 last := upd.Last() 1011 if last == nil { 1012 return false, fmt.Errorf("No updates") 1013 } 1014 1015 // Check the status has changed. 1016 if last.ClientStatus != structs.AllocClientStatusComplete { 1017 return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1018 } 1019 1020 // Check the state was cleaned 1021 if err := ar.stateDB.View(func(tx *bolt.Tx) error { 1022 if allocationBucketExists(tx, ar2.Alloc().ID) { 1023 return fmt.Errorf("bucket for alloc exists") 1024 } 1025 1026 return nil 1027 }); err != nil { 1028 return false, fmt.Errorf("state not destroyed") 1029 } 1030 1031 // Check the alloc directory was cleaned 1032 if _, err := os.Stat(ar.allocDir.AllocDir); err == nil { 1033 return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir) 1034 } else if !os.IsNotExist(err) { 1035 return false, fmt.Errorf("stat err: %v", err) 1036 } 1037 1038 return true, nil 1039 }, func(err error) { 1040 t.Fatalf("err: %v", err) 1041 }) 1042 } 1043 1044 // TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are 1045 // restarted on upgrade. 1046 func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) { 1047 t.Parallel() 1048 alloc := mock.Alloc() 1049 task := alloc.Job.TaskGroups[0].Tasks[0] 1050 task.Driver = "mock_driver" 1051 task.Config = map[string]interface{}{ 1052 "exit_code": "0", 1053 "run_for": "10s", 1054 } 1055 1056 upd, ar := testAllocRunnerFromAlloc(t, alloc, false) 1057 // Hack in old version to cause an upgrade on RestoreState 1058 origConfig := ar.config.Copy() 1059 ar.config.Version = &version.VersionInfo{Version: "0.5.6"} 1060 go ar.Run() 1061 defer ar.Destroy() 1062 1063 // Snapshot state 1064 testutil.WaitForResult(func() (bool, error) { 1065 last := upd.Last() 1066 if last == nil { 1067 return false, fmt.Errorf("No updates") 1068 } 1069 1070 if last.ClientStatus != structs.AllocClientStatusRunning { 1071 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) 1072 } 1073 return true, nil 1074 }, func(err error) { 1075 t.Fatalf("task never started: %v", err) 1076 }) 1077 1078 err := ar.SaveState() 1079 if err != nil { 1080 t.Fatalf("err: %v", err) 1081 } 1082 1083 // Create a new alloc runner 1084 l2 := prefixedTestLogger("ar2: ") 1085 alloc2 := &structs.Allocation{ID: ar.alloc.ID} 1086 prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "") 1087 ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc) 1088 err = ar2.RestoreState() 1089 if err != nil { 1090 t.Fatalf("err: %v", err) 1091 } 1092 go ar2.Run() 1093 defer ar2.Destroy() // Just-in-case of failure before Destroy below 1094 1095 testutil.WaitForResult(func() (bool, error) { 1096 last := upd.Last() 1097 if last == nil { 1098 return false, fmt.Errorf("No updates") 1099 } 1100 for _, ev := range last.TaskStates["web"].Events { 1101 if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) { 1102 return true, nil 1103 } 1104 } 1105 return false, fmt.Errorf("no restart with proper reason found") 1106 }, func(err error) { 1107 last := upd.Last() 1108 t.Fatalf("err: %v\nweb state: % #v", err, pretty.Formatter(last.TaskStates["web"])) 1109 }) 1110 1111 // Destroy and wait 1112 ar2.Destroy() 1113 start := time.Now() 1114 1115 testutil.WaitForResult(func() (bool, error) { 1116 alloc := ar2.Alloc() 1117 if alloc.ClientStatus != structs.AllocClientStatusComplete { 1118 return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete) 1119 } 1120 return true, nil 1121 }, func(err error) { 1122 last := upd.Last() 1123 t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates) 1124 }) 1125 1126 if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second { 1127 t.Fatalf("took too long to terminate") 1128 } 1129 } 1130 1131 // Ensure pre-#2132 state files containing the Context struct are properly 1132 // migrated to the new format. 1133 // 1134 // Old Context State: 1135 // 1136 // "Context": { 1137 // "AllocDir": { 1138 // "AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb", 1139 // "SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc", 1140 // "TaskDirs": { 1141 // "echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1" 1142 // } 1143 // }, 1144 // "AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb" 1145 // } 1146 func TestAllocRunner_RestoreOldState(t *testing.T) { 1147 t.Parallel() 1148 alloc := mock.Alloc() 1149 task := alloc.Job.TaskGroups[0].Tasks[0] 1150 task.Driver = "mock_driver" 1151 task.Config = map[string]interface{}{ 1152 "exit_code": "0", 1153 "run_for": "10s", 1154 } 1155 1156 logger := testLogger() 1157 conf := config.DefaultConfig() 1158 conf.Node = mock.Node() 1159 conf.StateDir = os.TempDir() 1160 conf.AllocDir = os.TempDir() 1161 tmp, err := ioutil.TempFile("", "state-db") 1162 if err != nil { 1163 t.Fatalf("error creating state db file: %v", err) 1164 } 1165 db, err := bolt.Open(tmp.Name(), 0600, nil) 1166 if err != nil { 1167 t.Fatalf("error creating state db: %v", err) 1168 } 1169 1170 if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil { 1171 t.Fatalf("error creating state dir: %v", err) 1172 } 1173 statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json") 1174 w, err := os.Create(statePath) 1175 if err != nil { 1176 t.Fatalf("error creating state file: %v", err) 1177 } 1178 tmplctx := &struct { 1179 AllocID string 1180 AllocDir string 1181 }{alloc.ID, conf.AllocDir} 1182 err = template.Must(template.New("test_state").Parse(`{ 1183 "Version": "0.5.1", 1184 "Alloc": { 1185 "ID": "{{ .AllocID }}", 1186 "Name": "example", 1187 "JobID": "example", 1188 "Job": { 1189 "ID": "example", 1190 "Name": "example", 1191 "Type": "batch", 1192 "TaskGroups": [ 1193 { 1194 "Name": "example", 1195 "Tasks": [ 1196 { 1197 "Name": "example", 1198 "Driver": "mock", 1199 "Config": { 1200 "exit_code": "0", 1201 "run_for": "10s" 1202 } 1203 } 1204 ] 1205 } 1206 ] 1207 }, 1208 "TaskGroup": "example", 1209 "DesiredStatus": "run", 1210 "ClientStatus": "running", 1211 "TaskStates": { 1212 "example": { 1213 "State": "running", 1214 "Failed": false, 1215 "Events": [] 1216 } 1217 } 1218 }, 1219 "Context": { 1220 "AllocDir": { 1221 "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}", 1222 "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc", 1223 "TaskDirs": { 1224 "example": "{{ .AllocDir }}/{{ .AllocID }}/example" 1225 } 1226 }, 1227 "AllocID": "{{ .AllocID }}" 1228 } 1229 }`)).Execute(w, tmplctx) 1230 if err != nil { 1231 t.Fatalf("error writing state file: %v", err) 1232 } 1233 w.Close() 1234 1235 upd := &MockAllocStateUpdater{} 1236 *alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0} 1237 alloc.Job.Type = structs.JobTypeBatch 1238 vclient := vaultclient.NewMockVaultClient() 1239 cclient := newMockConsulServiceClient(t) 1240 ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{}) 1241 defer ar.Destroy() 1242 1243 // RestoreState should fail on the task state since we only test the 1244 // alloc state restoring. 1245 err = ar.RestoreState() 1246 if err == nil { 1247 t.Fatal("expected error restoring Task state") 1248 } 1249 merr, ok := err.(*multierror.Error) 1250 if !ok { 1251 t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err) 1252 } 1253 if len(merr.Errors) != 1 { 1254 t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err) 1255 } 1256 if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) { 1257 t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error()) 1258 } 1259 1260 if err := ar.SaveState(); err != nil { 1261 t.Fatalf("error saving new state: %v", err) 1262 } 1263 } 1264 1265 func TestAllocRunner_TaskFailed_KillTG(t *testing.T) { 1266 t.Parallel() 1267 upd, ar := testAllocRunner(t, false) 1268 1269 // Create two tasks in the task group 1270 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 1271 task.Driver = "mock_driver" 1272 task.KillTimeout = 10 * time.Millisecond 1273 task.Config = map[string]interface{}{ 1274 "run_for": "10s", 1275 } 1276 1277 task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy() 1278 task2.Name = "task 2" 1279 task2.Driver = "mock_driver" 1280 task2.Config = map[string]interface{}{ 1281 "start_error": "fail task please", 1282 } 1283 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2) 1284 ar.alloc.TaskResources[task2.Name] = task2.Resources 1285 go ar.Run() 1286 defer ar.Destroy() 1287 1288 testutil.WaitForResult(func() (bool, error) { 1289 last := upd.Last() 1290 if last == nil { 1291 return false, fmt.Errorf("No updates") 1292 } 1293 if last.ClientStatus != structs.AllocClientStatusFailed { 1294 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed) 1295 } 1296 1297 // Task One should be killed 1298 state1 := last.TaskStates[task.Name] 1299 if state1.State != structs.TaskStateDead { 1300 return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) 1301 } 1302 if len(state1.Events) < 2 { 1303 // At least have a received and destroyed 1304 return false, fmt.Errorf("Unexpected number of events") 1305 } 1306 1307 found := false 1308 for _, e := range state1.Events { 1309 if e.Type != structs.TaskSiblingFailed { 1310 found = true 1311 } 1312 } 1313 1314 if !found { 1315 return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed) 1316 } 1317 1318 // Task Two should be failed 1319 state2 := last.TaskStates[task2.Name] 1320 if state2.State != structs.TaskStateDead { 1321 return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead) 1322 } 1323 if !state2.Failed { 1324 return false, fmt.Errorf("task2 should have failed") 1325 } 1326 1327 return true, nil 1328 }, func(err error) { 1329 t.Fatalf("err: %v", err) 1330 }) 1331 } 1332 1333 func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { 1334 t.Parallel() 1335 upd, ar := testAllocRunner(t, false) 1336 1337 // Create two tasks in the task group 1338 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 1339 task.Driver = "mock_driver" 1340 task.KillTimeout = 10 * time.Millisecond 1341 task.Config = map[string]interface{}{ 1342 "run_for": "10s", 1343 } 1344 1345 task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy() 1346 task2.Name = "task 2" 1347 task2.Driver = "mock_driver" 1348 task2.Leader = true 1349 task2.Config = map[string]interface{}{ 1350 "run_for": "1s", 1351 } 1352 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2) 1353 ar.alloc.TaskResources[task2.Name] = task2.Resources 1354 go ar.Run() 1355 defer ar.Destroy() 1356 1357 testutil.WaitForResult(func() (bool, error) { 1358 last := upd.Last() 1359 if last == nil { 1360 return false, fmt.Errorf("No updates") 1361 } 1362 if last.ClientStatus != structs.AllocClientStatusComplete { 1363 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1364 } 1365 1366 // Task One should be killed 1367 state1 := last.TaskStates[task.Name] 1368 if state1.State != structs.TaskStateDead { 1369 return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) 1370 } 1371 if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() { 1372 return false, fmt.Errorf("expected to have a start and finish time") 1373 } 1374 if len(state1.Events) < 2 { 1375 // At least have a received and destroyed 1376 return false, fmt.Errorf("Unexpected number of events") 1377 } 1378 1379 found := false 1380 for _, e := range state1.Events { 1381 if e.Type != structs.TaskLeaderDead { 1382 found = true 1383 } 1384 } 1385 1386 if !found { 1387 return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead) 1388 } 1389 1390 // Task Two should be dead 1391 state2 := last.TaskStates[task2.Name] 1392 if state2.State != structs.TaskStateDead { 1393 return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead) 1394 } 1395 if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() { 1396 return false, fmt.Errorf("expected to have a start and finish time") 1397 } 1398 1399 return true, nil 1400 }, func(err error) { 1401 t.Fatalf("err: %v", err) 1402 }) 1403 } 1404 1405 // TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group 1406 // with a leader the leader is stopped before other tasks. 1407 func TestAllocRunner_TaskLeader_StopTG(t *testing.T) { 1408 t.Parallel() 1409 upd, ar := testAllocRunner(t, false) 1410 1411 // Create 3 tasks in the task group 1412 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 1413 task.Name = "follower1" 1414 task.Driver = "mock_driver" 1415 task.KillTimeout = 10 * time.Millisecond 1416 task.Config = map[string]interface{}{ 1417 "run_for": "10s", 1418 } 1419 1420 task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy() 1421 task2.Name = "leader" 1422 task2.Driver = "mock_driver" 1423 task2.Leader = true 1424 task2.KillTimeout = 10 * time.Millisecond 1425 task2.Config = map[string]interface{}{ 1426 "run_for": "10s", 1427 } 1428 1429 task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy() 1430 task3.Name = "follower2" 1431 task3.Driver = "mock_driver" 1432 task3.KillTimeout = 10 * time.Millisecond 1433 task3.Config = map[string]interface{}{ 1434 "run_for": "10s", 1435 } 1436 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3) 1437 ar.alloc.TaskResources[task2.Name] = task2.Resources 1438 defer ar.Destroy() 1439 1440 go ar.Run() 1441 1442 // Wait for tasks to start 1443 last := upd.Last() 1444 testutil.WaitForResult(func() (bool, error) { 1445 last = upd.Last() 1446 if last == nil { 1447 return false, fmt.Errorf("No updates") 1448 } 1449 if n := len(last.TaskStates); n != 3 { 1450 return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n) 1451 } 1452 for name, state := range last.TaskStates { 1453 if state.State != structs.TaskStateRunning { 1454 return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State) 1455 } 1456 } 1457 return true, nil 1458 }, func(err error) { 1459 t.Fatalf("err: %v", err) 1460 }) 1461 1462 // Reset updates 1463 upd.mu.Lock() 1464 upd.Allocs = upd.Allocs[:0] 1465 upd.mu.Unlock() 1466 1467 // Stop alloc 1468 update := ar.Alloc() 1469 update.DesiredStatus = structs.AllocDesiredStatusStop 1470 ar.Update(update) 1471 1472 // Wait for tasks to stop 1473 testutil.WaitForResult(func() (bool, error) { 1474 last := upd.Last() 1475 if last == nil { 1476 return false, fmt.Errorf("No updates") 1477 } 1478 if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() { 1479 return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s", 1480 last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt) 1481 } 1482 if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() { 1483 return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s", 1484 last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt) 1485 } 1486 return true, nil 1487 }, func(err error) { 1488 last := upd.Last() 1489 for name, state := range last.TaskStates { 1490 t.Logf("%s: %s", name, state.State) 1491 } 1492 t.Fatalf("err: %v", err) 1493 }) 1494 } 1495 1496 // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a 1497 // restored task group with a leader that failed before restoring the leader is 1498 // not stopped as it does not exist. 1499 // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932 1500 func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) { 1501 t.Parallel() 1502 _, ar := testAllocRunner(t, false) 1503 defer ar.Destroy() 1504 1505 // Create a leader and follower task in the task group 1506 task := ar.alloc.Job.TaskGroups[0].Tasks[0] 1507 task.Name = "follower1" 1508 task.Driver = "mock_driver" 1509 task.KillTimeout = 10 * time.Second 1510 task.Config = map[string]interface{}{ 1511 "run_for": "10s", 1512 } 1513 1514 task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy() 1515 task2.Name = "leader" 1516 task2.Driver = "mock_driver" 1517 task2.Leader = true 1518 task2.KillTimeout = 10 * time.Millisecond 1519 task2.Config = map[string]interface{}{ 1520 "run_for": "0s", 1521 } 1522 1523 ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2) 1524 ar.alloc.TaskResources[task2.Name] = task2.Resources 1525 1526 // Mimic Nomad exiting before the leader stopping is able to stop other tasks. 1527 ar.tasks = map[string]*TaskRunner{ 1528 "leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState, 1529 ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(), 1530 ar.vaultClient, ar.consulClient), 1531 "follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState, 1532 ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(), 1533 ar.vaultClient, ar.consulClient), 1534 } 1535 ar.taskStates = map[string]*structs.TaskState{ 1536 "leader": {State: structs.TaskStateDead}, 1537 "follower1": {State: structs.TaskStateRunning}, 1538 } 1539 if err := ar.SaveState(); err != nil { 1540 t.Fatalf("error saving state: %v", err) 1541 } 1542 1543 // Create a new AllocRunner to test RestoreState and Run 1544 upd2 := &MockAllocStateUpdater{} 1545 ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc, 1546 ar.vaultClient, ar.consulClient, ar.prevAlloc) 1547 defer ar2.Destroy() 1548 1549 if err := ar2.RestoreState(); err != nil { 1550 t.Fatalf("error restoring state: %v", err) 1551 } 1552 go ar2.Run() 1553 1554 // Wait for tasks to be stopped because leader is dead 1555 testutil.WaitForResult(func() (bool, error) { 1556 last := upd2.Last() 1557 if last == nil { 1558 return false, fmt.Errorf("No updates") 1559 } 1560 if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead { 1561 return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual) 1562 } 1563 if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead { 1564 return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual) 1565 } 1566 return true, nil 1567 }, func(err error) { 1568 last := upd2.Last() 1569 for name, state := range last.TaskStates { 1570 t.Logf("%s: %s", name, state.State) 1571 } 1572 t.Fatalf("err: %v", err) 1573 }) 1574 1575 // Make sure it GCs properly 1576 ar2.Destroy() 1577 1578 select { 1579 case <-ar2.WaitCh(): 1580 // exited as expected 1581 case <-time.After(10 * time.Second): 1582 t.Fatalf("timed out waiting for AR to GC") 1583 } 1584 } 1585 1586 // TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's 1587 // local/ dir will be moved to a replacement alloc's local/ dir if sticky 1588 // volumes is on. 1589 func TestAllocRunner_MoveAllocDir(t *testing.T) { 1590 t.Parallel() 1591 // Create an alloc runner 1592 alloc := mock.Alloc() 1593 task := alloc.Job.TaskGroups[0].Tasks[0] 1594 task.Driver = "mock_driver" 1595 task.Config = map[string]interface{}{ 1596 "run_for": "1s", 1597 } 1598 upd, ar := testAllocRunnerFromAlloc(t, alloc, false) 1599 go ar.Run() 1600 defer ar.Destroy() 1601 1602 testutil.WaitForResult(func() (bool, error) { 1603 last := upd.Last() 1604 if last == nil { 1605 return false, fmt.Errorf("No updates") 1606 } 1607 if last.ClientStatus != structs.AllocClientStatusComplete { 1608 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1609 } 1610 return true, nil 1611 }, func(err error) { 1612 t.Fatalf("err: %v", err) 1613 }) 1614 1615 // Write some data in data dir and task dir of the alloc 1616 dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file") 1617 ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm) 1618 taskDir := ar.allocDir.TaskDirs[task.Name] 1619 taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file") 1620 ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm) 1621 1622 // Create another alloc runner 1623 alloc2 := mock.Alloc() 1624 alloc2.PreviousAllocation = ar.allocID 1625 alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true 1626 task = alloc2.Job.TaskGroups[0].Tasks[0] 1627 task.Driver = "mock_driver" 1628 task.Config = map[string]interface{}{ 1629 "run_for": "1s", 1630 } 1631 upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false) 1632 1633 // Set prevAlloc like Client does 1634 ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "") 1635 1636 go ar2.Run() 1637 defer ar2.Destroy() 1638 1639 testutil.WaitForResult(func() (bool, error) { 1640 last := upd2.Last() 1641 if last == nil { 1642 return false, fmt.Errorf("No updates") 1643 } 1644 if last.ClientStatus != structs.AllocClientStatusComplete { 1645 return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete) 1646 } 1647 return true, nil 1648 }, func(err error) { 1649 t.Fatalf("err: %v", err) 1650 }) 1651 1652 // Ensure that data from ar was moved to ar2 1653 taskDir = ar2.allocDir.TaskDirs[task.Name] 1654 taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file") 1655 if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil { 1656 t.Fatalf("file %v not found", taskLocalFile) 1657 } 1658 1659 dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file") 1660 if fileInfo, _ := os.Stat(dataFile); fileInfo == nil { 1661 t.Fatalf("file %v not found", dataFile) 1662 } 1663 }