github.com/emate/nomad@v0.8.2-wo-binpacking/client/alloc_runner_test.go

package client

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"text/template"
	"time"

	"github.com/boltdb/bolt"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/nomad/version"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"

	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/stretchr/testify/require"
)

type MockAllocStateUpdater struct {
	Allocs []*structs.Allocation
	mu     sync.Mutex
}

// Update fulfills the TaskStateUpdater interface
func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
	m.mu.Lock()
	m.Allocs = append(m.Allocs, alloc)
	m.mu.Unlock()
}

// Last returns a copy of the last alloc (or nil) sync'd
func (m *MockAllocStateUpdater) Last() *structs.Allocation {
	m.mu.Lock()
	defer m.mu.Unlock()
	n := len(m.Allocs)
	if n == 0 {
		return nil
	}
	return m.Allocs[n-1].Copy()
}

// allocationBucketExists checks if the allocation bucket was created.
func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
	allocations := tx.Bucket(allocationsBucket)
	if allocations == nil {
		return false
	}

	// Retrieve the specific allocations bucket
	alloc := allocations.Bucket([]byte(allocID))
	return alloc != nil
}
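
// testAllocRunnerFromAlloc builds an AllocRunner for the given allocation
// using a default client config, temporary state and alloc directories, a
// throwaway bolt state database, and mock Vault and Consul clients. When
// restarts is false, the task group's restart policy is zeroed out and the
// job is switched to batch so tasks are not restarted after they exit.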
func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, _ := ioutil.TempFile("", "state-db")
	db, _ := bolt.Open(tmp.Name(), 0600, nil)
	upd := &MockAllocStateUpdater{}
	if !restarts {
		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
		alloc.Job.Type = structs.JobTypeBatch
	}
	vclient := vaultclient.NewMockVaultClient()
	ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{})
	return upd, ar
}

func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	// Use mock driver
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "500ms"
	return testAllocRunnerFromAlloc(t, alloc, restarts)
}
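
// TestAllocRunner_SimpleRun runs a short-lived mock_driver batch allocation
// and waits for the reported client status to reach "complete".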
func TestAllocRunner_SimpleRun(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that FinishedAt is set when the alloc is in a terminal state
func TestAllocRunner_FinishedAtSet(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	_, ar := testAllocRunner(t, false)
	ar.allocClientStatus = structs.AllocClientStatusFailed
	alloc := ar.Alloc()
	taskFinishedAt := make(map[string]time.Time)
	require.NotEmpty(alloc.TaskStates)
	for name, s := range alloc.TaskStates {
		require.False(s.FinishedAt.IsZero())
		taskFinishedAt[name] = s.FinishedAt
	}

	// Verify that calling again should not mutate finishedAt
	alloc2 := ar.Alloc()
	for name, s := range alloc2.TaskStates {
		require.Equal(taskFinishedAt[name], s.FinishedAt)
	}
}

// Test that FinishedAt is set when the alloc is in a terminal state
func TestAllocRunner_FinishedAtSet_TaskEvents(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	_, ar := testAllocRunner(t, false)
	ar.taskStates[ar.alloc.Job.TaskGroups[0].Tasks[0].Name] = &structs.TaskState{State: structs.TaskStateDead, Failed: true}

	alloc := ar.Alloc()
	taskFinishedAt := make(map[string]time.Time)
	require.NotEmpty(alloc.TaskStates)
	for name, s := range alloc.TaskStates {
		require.False(s.FinishedAt.IsZero())
		taskFinishedAt[name] = s.FinishedAt
	}

	// Verify that calling again should not mutate finishedAt
	alloc2 := ar.Alloc()
	for name, s := range alloc2.TaskStates {
		require.Equal(taskFinishedAt[name], s.FinishedAt)
	}
}

// Test that the watcher will mark the allocation as unhealthy.
func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Ensure the task fails and restarts
	upd, ar := testAllocRunner(t, true)

	// Make the task fail
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_error"] = "test error"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "failed task")
}

// Test that the watcher will mark the allocation as unhealthy if it hits its
// deadline.
func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
	t.Parallel()

	// Don't restart but force service job type
	upd, ar := testAllocRunner(t, false)
	ar.alloc.Job.Type = structs.JobTypeService

	// Make the task block
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_block_for"] = "4s"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Assert alloc is unhealthy
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}

		// Assert there is a task event explaining why we are unhealthy.
		state, ok := last.TaskStates[task.Name]
		if !ok {
			return false, fmt.Errorf("missing state for task %s", task.Name)
		}
		n := len(state.Events)
		if n == 0 {
			return false, fmt.Errorf("no task events")
		}
		lastEvent := state.Events[n-1]
		if lastEvent.Type != allocHealthEventSource {
			return false, fmt.Errorf("expected %q; found %q", allocHealthEventSource, lastEvent.Type)
		}
		if !strings.Contains(lastEvent.Message, "not running by deadline") {
			return false, fmt.Errorf(`expected "not running by deadline" but found: %s`, lastEvent.Message)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	// Allow the task to restart
	upd, ar := testAllocRunner(t, true)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that takes longer to become healthy
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as healthy with checks
func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
	t.Parallel()

	// Allow the task to restart
	upd, ar := testAllocRunner(t, true)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that has no checks
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Services = nil

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	checkHealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthPassing,
	}
	checkUnhealthy := &api.AgentCheck{
		CheckID: checkHealthy.CheckID,
		Status:  api.HealthWarning,
	}

	// Only return the check as healthy after a duration
	trigger := time.After(500 * time.Millisecond)
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		select {
		case <-trigger:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkHealthy},
							},
						},
					},
				},
			}, nil
		default:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkUnhealthy},
							},
						},
					},
				},
			}, nil
		}
	}

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as unhealthy with failing
// checks
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Allow the task to restart
	upd, ar := testAllocRunner(t, true)

	// Make the task run for a while
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	// Always return the check as unhealthy
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.TaskRegistration{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "foo"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "Services not healthy by deadline")
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
	t.Parallel()

	// Allow the task to restart
	upd, ar := testAllocRunner(t, true)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "30s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mimic an update to a new deployment id
	last := upd.Last()
	last.DeploymentStatus = nil
	last.DeploymentID = uuid.Generate()
	ar.Update(last)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that health is reported for services that got migrated; not just part
// of deployments.
func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
	t.Parallel()

	// Allow the task to restart
	upd, ar := testAllocRunner(t, true)

	// Make the task run healthy
	tg := ar.alloc.Job.TaskGroups[0]
	task := tg.Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "30s"

	// Shorten the default migration healthy time
	tg.Migrate = structs.DefaultMigrateStrategy()
	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates

	// Ensure the alloc is *not* part of a deployment
	ar.alloc.DeploymentID = ""

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status healthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that health is *not* reported for batch jobs
func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
	t.Parallel()

	// Use a batch alloc
	alloc := mock.BatchAlloc()
	tg := alloc.Job.TaskGroups[0]

	// This should not be possible as validation should prevent batch jobs
	// from having a migration stanza!
	tg.Migrate = structs.DefaultMigrateStrategy()
	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates

	task := tg.Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "5s"
	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus != nil {
			return false, fmt.Errorf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_RetryArtifact ensures that if one task in a task group is
// retrying an artifact fetch, other tasks in the group are still able to
// proceed.
func TestAllocRunner_RetryArtifact(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}

	// Create a new task with a bad artifact
	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	badtask.Name = "bad"
	badtask.Artifacts = []*structs.TaskArtifact{
		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
	upd, ar := testAllocRunnerFromAlloc(t, alloc, true)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// web task should have completed successfully while bad task
		// retries artifact fetching
		webstate, ok := last.TaskStates["web"]
		if !ok {
			return false, fmt.Errorf("no task state for web")
		}
		if webstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
		}
		if !webstate.Successful() {
			return false, fmt.Errorf("expected web to have exited successfully")
		}

		// bad task should have failed
		badstate := last.TaskStates["bad"]
		if badstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
		}
		if !badstate.Failed {
			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
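
// TestAllocRunner_TerminalUpdate_Destroy asserts that updating an alloc to a
// terminal desired status stops its tasks while keeping the allocation's
// state bucket and alloc dir on disk, and that a subsequent Destroy removes
// both.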
func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the allocation state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
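
// TestAllocRunner_Destroy asserts that destroying a running alloc runner stops
// its tasks, removes the allocation bucket from the state database, and
// deletes the alloc directory.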
func TestAllocRunner_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	start := time.Now()

	// Begin the tear down
	go func() {
		time.Sleep(1 * time.Second)
		ar.Destroy()
	}()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed: %v", err)
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if elapsed := time.Since(start); elapsed > 20*time.Second {
		t.Fatalf("took too long to terminate: %s", elapsed)
	}
}

func TestAllocRunner_Update(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)

	// Deep copy the alloc to avoid races when updating
	newAlloc := ar.Alloc().Copy()

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	// Update the alloc definition
	newAlloc.Name = "FOO"
	newAlloc.AllocModifyIndex++
	ar.Update(newAlloc)

	// Check the alloc runner stores the updated allocation.
	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().Name == "FOO", nil
	}, func(err error) {
		t.Fatalf("err: %v %#v", err, ar.Alloc())
	})
}
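
// TestAllocRunner_SaveRestoreState starts an allocation, snapshots the runner
// state to the state database, then builds a second AllocRunner from that
// state and verifies the restored runner resumes tracking the running task
// before being destroyed.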
func TestAllocRunner_SaveRestoreState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		ar.taskLock.RLock()
		defer ar.taskLock.RUnlock()
		return len(ar.tasks) == 1, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("----- ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()

	testutil.WaitForResult(func() (bool, error) {
		if len(ar2.tasks) != 1 {
			return false, fmt.Errorf("Incorrect number of tasks")
		}

		last := upd.Last()
		if last == nil {
			return false, nil
		}

		return last.ClientStatus == structs.AllocClientStatusRunning, nil
	}, func(err error) {
		last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}
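
// TestAllocRunner_SaveRestoreState_TerminalAlloc saves the state of an alloc
// that has been updated to a terminal desired status, restores it into a
// second AllocRunner, and verifies the state bucket and alloc dir survive the
// restore but are removed once the restored runner is destroyed.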
func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	ar.logger = prefixedTestLogger("ar1: ")

	// Ensure task takes some time
	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure ar1 doesn't recreate the state file
	ar.allocLock.Lock()
	defer ar.allocLock.Unlock()

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	ar2.logger.Println("[TESTING] running second alloc runner")
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		// Check the state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar2.logger.Println("[TESTING] destroying second alloc runner")
	ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are
// restarted on upgrade.
func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	// Hack in old version to cause an upgrade on RestoreState
	origConfig := ar.config.Copy()
	ar.config.Version = &version.VersionInfo{Version: "0.5.6"}
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "")
	ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		for _, ev := range last.TaskStates["web"].Events {
			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
				return true, nil
			}
		}
		return false, fmt.Errorf("no restart with proper reason found")
	}, func(err error) {
		last := upd.Last()
		t.Fatalf("err: %v\nweb state: % #v", err, pretty.Formatter(last.TaskStates["web"]))
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}

// Ensure pre-#2132 state files containing the Context struct are properly
// migrated to the new format.
//
// Old Context State:
//
//  "Context": {
//    "AllocDir": {
//      "AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb",
//      "SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc",
//      "TaskDirs": {
//        "echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1"
//      }
//    },
//    "AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb"
//  }
func TestAllocRunner_RestoreOldState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	logger := testLogger()
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, err := ioutil.TempFile("", "state-db")
	if err != nil {
		t.Fatalf("error creating state db file: %v", err)
	}
	db, err := bolt.Open(tmp.Name(), 0600, nil)
	if err != nil {
		t.Fatalf("error creating state db: %v", err)
	}

	if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil {
		t.Fatalf("error creating state dir: %v", err)
	}
	statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json")
	w, err := os.Create(statePath)
	if err != nil {
		t.Fatalf("error creating state file: %v", err)
	}
	tmplctx := &struct {
		AllocID  string
		AllocDir string
	}{alloc.ID, conf.AllocDir}
	err = template.Must(template.New("test_state").Parse(`{
  "Version": "0.5.1",
  "Alloc": {
    "ID": "{{ .AllocID }}",
    "Name": "example",
    "JobID": "example",
    "Job": {
      "ID": "example",
      "Name": "example",
      "Type": "batch",
      "TaskGroups": [
        {
          "Name": "example",
          "Tasks": [
            {
              "Name": "example",
              "Driver": "mock",
              "Config": {
                "exit_code": "0",
                "run_for": "10s"
              }
            }
          ]
        }
      ]
    },
    "TaskGroup": "example",
    "DesiredStatus": "run",
    "ClientStatus": "running",
    "TaskStates": {
      "example": {
        "State": "running",
        "Failed": false,
        "Events": []
      }
    }
  },
  "Context": {
    "AllocDir": {
      "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}",
      "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc",
      "TaskDirs": {
        "example": "{{ .AllocDir }}/{{ .AllocID }}/example"
      }
    },
    "AllocID": "{{ .AllocID }}"
  }
}`)).Execute(w, tmplctx)
	if err != nil {
		t.Fatalf("error writing state file: %v", err)
	}
	w.Close()

	upd := &MockAllocStateUpdater{}
	*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
	alloc.Job.Type = structs.JobTypeBatch
	vclient := vaultclient.NewMockVaultClient()
	cclient := newMockConsulServiceClient(t)
	ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{})
	defer ar.Destroy()

	// RestoreState should fail on the task state since we only test the
	// alloc state restoring.
	err = ar.RestoreState()
	if err == nil {
		t.Fatal("expected error restoring Task state")
	}
	merr, ok := err.(*multierror.Error)
	if !ok {
		t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err)
	}
	if len(merr.Errors) != 1 {
		t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err)
	}
	if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) {
		t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error())
	}

	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving new state: %v", err)
	}
}
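
// TestAllocRunner_TaskFailed_KillTG asserts that when one task in a task group
// fails to start, the allocation is marked failed and the sibling task is
// killed, recording a TaskSiblingFailed event.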
func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Config = map[string]interface{}{
		"start_error": "fail task please",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskSiblingFailed {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
		}

		// Task Two should be failed
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if !state2.Failed {
			return false, fmt.Errorf("task2 should have failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}
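
// TestAllocRunner_TaskLeader_KillTG asserts that when the leader task of a
// task group exits, the remaining tasks are killed and the allocation
// completes, with a TaskLeaderDead event recorded on the follower.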
func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
// with a leader the leader is stopped before other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create 3 tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.KillTimeout = 10 * time.Millisecond
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	defer ar.Destroy()

	go ar.Run()

	// Wait for tasks to start
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.mu.Lock()
	upd.Allocs = upd.Allocs[:0]
	upd.mu.Unlock()

	// Stop alloc
	update := ar.Alloc()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group whose leader failed before the restore, the leader is
// not stopped again since it no longer exists.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)
	defer ar.Destroy()

	// Create a leader and follower task in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "0s",
	}

	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources

	// Mimic Nomad exiting before the leader's stop could stop the other tasks.
	ar.tasks = map[string]*TaskRunner{
		"leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
			ar.vaultClient, ar.consulClient),
		"follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
			ar.vaultClient, ar.consulClient),
	}
	ar.taskStates = map[string]*structs.TaskState{
		"leader":    {State: structs.TaskStateDead},
		"follower1": {State: structs.TaskStateRunning},
	}
	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving state: %v", err)
	}

	// Create a new AllocRunner to test RestoreState and Run
	upd2 := &MockAllocStateUpdater{}
	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
		ar.vaultClient, ar.consulClient, ar.prevAlloc)
	defer ar2.Destroy()

	if err := ar2.RestoreState(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	go ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual)
		}
		if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual)
		}
		return true, nil
	}, func(err error) {
		last := upd2.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.WaitCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

// TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
// local/ dir will be moved to a replacement alloc's local/ dir if sticky
// volumes are enabled.
func TestAllocRunner_MoveAllocDir(t *testing.T) {
	t.Parallel()
	// Create an alloc runner
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Write some data in data dir and task dir of the alloc
	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
	taskDir := ar.allocDir.TaskDirs[task.Name]
	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)

	// Create another alloc runner
	alloc2 := mock.Alloc()
	alloc2.PreviousAllocation = ar.allocID
	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
	task = alloc2.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false)

	// Set prevAlloc like Client does
	ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")

	go ar2.Run()
	defer ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Ensure that data from ar was moved to ar2
	taskDir = ar2.allocDir.TaskDirs[task.Name]
	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
		t.Fatalf("file %v not found", taskLocalFile)
	}

	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
		t.Fatalf("file %v not found", dataFile)
	}
}