github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/alloc_runner_test.go

package client

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"text/template"
	"time"

	"github.com/boltdb/bolt"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/nomad/version"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"

	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/vaultclient"
)

type MockAllocStateUpdater struct {
	Allocs []*structs.Allocation
	mu     sync.Mutex
}

// Update fulfills the TaskStateUpdater interface
func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
	m.mu.Lock()
	m.Allocs = append(m.Allocs, alloc)
	m.mu.Unlock()
}

// Last returns the total number of updates and the last alloc (or nil)
func (m *MockAllocStateUpdater) Last() (int, *structs.Allocation) {
	m.mu.Lock()
	defer m.mu.Unlock()
	n := len(m.Allocs)
	if n == 0 {
		return 0, nil
	}
	return n, m.Allocs[n-1].Copy()
}

// allocationBucketExists checks if the allocation bucket was created.
func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
	allocations := tx.Bucket(allocationsBucket)
	if allocations == nil {
		return false
	}

	// Retrieve the specific allocations bucket
	alloc := allocations.Bucket([]byte(allocID))
	return alloc != nil
}

func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, _ := ioutil.TempFile("", "state-db")
	db, _ := bolt.Open(tmp.Name(), 0600, nil)
	upd := &MockAllocStateUpdater{}
	if !restarts {
		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
		alloc.Job.Type = structs.JobTypeBatch
	}
	vclient := vaultclient.NewMockVaultClient()
	ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{})
	return upd, ar
}

func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	// Use mock driver
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "500ms"
	return testAllocRunnerFromAlloc(t, alloc, restarts)
}

func TestAllocRunner_SimpleRun(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that the watcher will mark the allocation as unhealthy.
func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts so the failed task is not retried
	upd, ar := testAllocRunner(t, false)

	// Make the task fail
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_error"] = "test error"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "failed task")
}

// Test that the watcher will mark the allocation as unhealthy if it hits its
// deadline.
func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task block
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_block_for"] = "2s"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "not running by deadline")
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that takes longer to become healthy
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as healthy with checks
func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run for a while
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that has no checks
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Services = nil

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	checkHealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthPassing,
	}
	checkUnhealthy := &api.AgentCheck{
		CheckID: checkHealthy.CheckID,
		Status:  api.HealthWarning,
	}

	// Only return the check as healthy after a duration
	trigger := time.After(500 * time.Millisecond)
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		select {
		case <-trigger:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkHealthy},
							},
						},
					},
				},
			}, nil
		default:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkUnhealthy},
							},
						},
					},
				},
			}, nil
		}
	}

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as unhealthy with failing
// checks
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run for a while
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	// Always return the check as unhealthy
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.TaskRegistration{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "foo"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "Services not healthy by deadline")
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "30s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mimic an update to a new deployment id
	oldCount, last := upd.Last()
	last.DeploymentStatus = nil
	last.DeploymentID = uuid.Generate()
	ar.Update(last)

	testutil.WaitForResult(func() (bool, error) {
		newCount, last := upd.Last()
		if newCount <= oldCount {
			return false, fmt.Errorf("No new updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_RetryArtifact ensures that if one task in a task group is
// retrying fetching an artifact, other tasks in the group should be able
// to proceed.
func TestAllocRunner_RetryArtifact(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}

	// Create a new task with a bad artifact
	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	badtask.Name = "bad"
	badtask.Artifacts = []*structs.TaskArtifact{
		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
	upd, ar := testAllocRunnerFromAlloc(t, alloc, true)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		count, last := upd.Last()
		if min := 6; count < min {
			return false, fmt.Errorf("Not enough updates (%d < %d)", count, min)
		}

		// web task should have completed successfully while bad task
		// retries artifact fetching
		webstate := last.TaskStates["web"]
		if webstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
		}
		if !webstate.Successful() {
			return false, fmt.Errorf("expected web to have exited successfully")
		}

		// bad task should have failed
		badstate := last.TaskStates["bad"]
		if badstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
		}
		if !badstate.Failed {
			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the allocation state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	start := time.Now()

	// Begin the tear down
	go func() {
		time.Sleep(1 * time.Second)
		ar.Destroy()
	}()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed: %v", err)
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if elapsed := time.Since(start); elapsed > 20*time.Second {
		t.Fatalf("took too long to terminate: %s", elapsed)
	}
}

func TestAllocRunner_Update(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)

	// Deep copy the alloc to avoid races when updating
	newAlloc := ar.Alloc().Copy()

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	// Update the alloc definition
	newAlloc.Name = "FOO"
	newAlloc.AllocModifyIndex++
	ar.Update(newAlloc)

	// Check the alloc runner stores the updated allocation.
	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().Name == "FOO", nil
	}, func(err error) {
		t.Fatalf("err: %v %#v", err, ar.Alloc())
	})
}

func TestAllocRunner_SaveRestoreState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		ar.taskLock.RLock()
		defer ar.taskLock.RUnlock()
		return len(ar.tasks) == 1, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("----- ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()

	testutil.WaitForResult(func() (bool, error) {
		if len(ar2.tasks) != 1 {
			return false, fmt.Errorf("Incorrect number of tasks")
		}

		_, last := upd.Last()
		if last == nil {
			return false, nil
		}

		return last.ClientStatus == structs.AllocClientStatusRunning, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}

func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	ar.logger = prefixedTestLogger("ar1: ")

	// Ensure task takes some time
	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure ar1 doesn't recreate the state file
	ar.allocLock.Lock()
	defer ar.allocLock.Unlock()

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	ar2.logger.Println("[TESTING] running second alloc runner")
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		// Check the state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar2.logger.Println("[TESTING] destroying second alloc runner")
	ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are
// restarted on upgrade.
func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	// Hack in old version to cause an upgrade on RestoreState
	origConfig := ar.config.Copy()
	ar.config.Version = &version.VersionInfo{Version: "0.5.6"}
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "")
	ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		count, last := upd.Last()
		if min := 3; count < min {
			return false, fmt.Errorf("expected at least %d updates but found %d", min, count)
		}
		for _, ev := range last.TaskStates["web"].Events {
			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
				return true, nil
			}
		}
		return false, fmt.Errorf("no restart with proper reason found")
	}, func(err error) {
		count, last := upd.Last()
		t.Fatalf("err: %v\nAllocs: %d\nweb state: % #v", err, count, pretty.Formatter(last.TaskStates["web"]))
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}

// Ensure pre-#2132 state files containing the Context struct are properly
// migrated to the new format.
//
// Old Context State:
//
//	"Context": {
//		"AllocDir": {
//			"AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb",
//			"SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc",
//			"TaskDirs": {
//				"echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1"
//			}
//		},
//		"AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb"
//	}
func TestAllocRunner_RestoreOldState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	logger := testLogger()
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, err := ioutil.TempFile("", "state-db")
	if err != nil {
		t.Fatalf("error creating state db file: %v", err)
	}
	db, err := bolt.Open(tmp.Name(), 0600, nil)
	if err != nil {
		t.Fatalf("error creating state db: %v", err)
	}

	if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil {
		t.Fatalf("error creating state dir: %v", err)
	}
	statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json")
	w, err := os.Create(statePath)
	if err != nil {
		t.Fatalf("error creating state file: %v", err)
	}
	tmplctx := &struct {
		AllocID  string
		AllocDir string
	}{alloc.ID, conf.AllocDir}
	err = template.Must(template.New("test_state").Parse(`{
  "Version": "0.5.1",
  "Alloc": {
    "ID": "{{ .AllocID }}",
    "Name": "example",
    "JobID": "example",
    "Job": {
      "ID": "example",
      "Name": "example",
      "Type": "batch",
      "TaskGroups": [
        {
          "Name": "example",
          "Tasks": [
            {
              "Name": "example",
              "Driver": "mock",
              "Config": {
                "exit_code": "0",
                "run_for": "10s"
              }
            }
          ]
        }
      ]
    },
    "TaskGroup": "example",
    "DesiredStatus": "run",
    "ClientStatus": "running",
    "TaskStates": {
      "example": {
        "State": "running",
        "Failed": false,
        "Events": []
      }
    }
  },
  "Context": {
    "AllocDir": {
      "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}",
      "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc",
      "TaskDirs": {
        "example": "{{ .AllocDir }}/{{ .AllocID }}/example"
      }
    },
    "AllocID": "{{ .AllocID }}"
  }
}`)).Execute(w, tmplctx)
	if err != nil {
		t.Fatalf("error writing state file: %v", err)
	}
	w.Close()

	upd := &MockAllocStateUpdater{}
	*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
	alloc.Job.Type = structs.JobTypeBatch
	vclient := vaultclient.NewMockVaultClient()
	cclient := newMockConsulServiceClient(t)
	ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{})
	defer ar.Destroy()

	// RestoreState should fail on the task state since we only test the
	// alloc state restoring.
	err = ar.RestoreState()
	if err == nil {
		t.Fatal("expected error restoring Task state")
	}
	merr, ok := err.(*multierror.Error)
	if !ok {
		t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err)
	}
	if len(merr.Errors) != 1 {
		t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err)
	}
	if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) {
		t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error())
	}

	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving new state: %v", err)
	}
}

func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Config = map[string]interface{}{
		"start_error": "fail task please",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskSiblingFailed {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
		}

		// Task Two should be failed
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if !state2.Failed {
			return false, fmt.Errorf("task2 should have failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
// with a leader, the leader is stopped before the other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create 3 tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.KillTimeout = 10 * time.Millisecond
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	defer ar.Destroy()

	go ar.Run()

	// Wait for tasks to start
	oldCount, last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		oldCount, last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Stop alloc
	update := ar.Alloc()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		newCount, last := upd.Last()
		if newCount == oldCount {
			return false, fmt.Errorf("no new updates (count: %d)", newCount)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		count, last := upd.Last()
		t.Logf("Updates: %d", count)
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group whose leader failed before the restore, the leader is
// not stopped since it no longer exists.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)
	defer ar.Destroy()

	// Create a leader and follower task in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "0s",
	}

	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources

	// Mimic Nomad exiting before the leader stopping is able to stop other tasks.
	ar.tasks = map[string]*TaskRunner{
		"leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
			ar.vaultClient, ar.consulClient),
		"follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
			ar.vaultClient, ar.consulClient),
	}
	ar.taskStates = map[string]*structs.TaskState{
		"leader":    {State: structs.TaskStateDead},
		"follower1": {State: structs.TaskStateRunning},
	}
	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving state: %v", err)
	}

	// Create a new AllocRunner to test RestoreState and Run
	upd2 := &MockAllocStateUpdater{}
	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
		ar.vaultClient, ar.consulClient, ar.prevAlloc)
	defer ar2.Destroy()

	if err := ar2.RestoreState(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	go ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		_, last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("no updates yet")
		}
		if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual)
		}
		if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual)
		}
		return true, nil
	}, func(err error) {
		count, last := upd2.Last()
		t.Logf("Updates: %d", count)
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.WaitCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

// TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
// local/ dir will be moved to a replacement alloc's local/ dir if sticky
// volumes are enabled.
func TestAllocRunner_MoveAllocDir(t *testing.T) {
	t.Parallel()
	// Create an alloc runner
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Write some data in data dir and task dir of the alloc
	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
	taskDir := ar.allocDir.TaskDirs[task.Name]
	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)

	// Create another alloc runner
	alloc2 := mock.Alloc()
	alloc2.PreviousAllocation = ar.allocID
	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
	task = alloc2.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false)

	// Set prevAlloc like Client does
	ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")

	go ar2.Run()
	defer ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Ensure that data from ar was moved to ar2
	taskDir = ar2.allocDir.TaskDirs[task.Name]
	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
		t.Fatalf("file %v not found", taskLocalFile)
	}

	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
		t.Fatalf("file %v not found", dataFile)
	}
}