github.com/djenriquez/nomad-1@v0.8.1/nomad/drainer/watch_jobs_test.go (about) 1 package drainer 2 3 import ( 4 "context" 5 "testing" 6 "time" 7 8 "github.com/hashicorp/nomad/helper" 9 "github.com/hashicorp/nomad/helper/testlog" 10 "github.com/hashicorp/nomad/nomad/mock" 11 "github.com/hashicorp/nomad/nomad/state" 12 "github.com/hashicorp/nomad/nomad/structs" 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/require" 15 "golang.org/x/time/rate" 16 ) 17 18 func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) { 19 n1 := mock.Node() 20 n1.Name = "draining" 21 n1.DrainStrategy = &structs.DrainStrategy{ 22 DrainSpec: structs.DrainSpec{ 23 Deadline: time.Minute, 24 }, 25 ForceDeadline: time.Now().Add(time.Minute), 26 } 27 require.Nil(t, state.UpsertNode(100, n1)) 28 29 // Create a non-draining node 30 n2 := mock.Node() 31 n2.Name = "running" 32 require.Nil(t, state.UpsertNode(101, n2)) 33 return n1, n2 34 } 35 36 func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) { 37 t.Helper() 38 39 limiter := rate.NewLimiter(100.0, 100) 40 logger := testlog.Logger(t) 41 ctx, cancel := context.WithCancel(context.Background()) 42 w := NewDrainingJobWatcher(ctx, limiter, state, logger) 43 return w, cancel 44 } 45 46 // TestDrainingJobWatcher_Interface is a compile-time assertion that we 47 // implement the intended interface. 48 func TestDrainingJobWatcher_Interface(t *testing.T) { 49 w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t)) 50 cancel() 51 var _ DrainingJobWatcher = w 52 } 53 54 // asertJobWatcherOps asserts a certain number of allocs are drained and/or 55 // migrated by the job watcher. 56 func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) ( 57 *DrainRequest, []*structs.Allocation) { 58 t.Helper() 59 var ( 60 drains *DrainRequest 61 migrations []*structs.Allocation 62 drainsChecked, migrationsChecked bool 63 ) 64 for { 65 select { 66 case drains = <-jw.Drain(): 67 ids := make([]string, len(drains.Allocs)) 68 for i, a := range drains.Allocs { 69 ids[i] = a.JobID[:6] + ":" + a.ID[:6] 70 } 71 t.Logf("draining %d allocs: %v", len(ids), ids) 72 require.False(t, drainsChecked, "drains already received") 73 drainsChecked = true 74 require.Lenf(t, drains.Allocs, drained, 75 "expected %d drains but found %d", drained, len(drains.Allocs)) 76 case migrations = <-jw.Migrated(): 77 ids := make([]string, len(migrations)) 78 for i, a := range migrations { 79 ids[i] = a.JobID[:6] + ":" + a.ID[:6] 80 } 81 t.Logf("migrating %d allocs: %v", len(ids), ids) 82 require.False(t, migrationsChecked, "migrations already received") 83 migrationsChecked = true 84 require.Lenf(t, migrations, migrated, 85 "expected %d migrations but found %d", migrated, len(migrations)) 86 case <-time.After(10 * time.Millisecond): 87 if !drainsChecked && drained > 0 { 88 t.Fatalf("expected %d drains but none happened", drained) 89 } 90 if !migrationsChecked && migrated > 0 { 91 t.Fatalf("expected %d migrations but none happened", migrated) 92 } 93 return drains, migrations 94 } 95 } 96 } 97 98 // TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches 99 // allocation changes from multiple jobs. 100 func TestDrainingJobWatcher_DrainJobs(t *testing.T) { 101 t.Parallel() 102 require := require.New(t) 103 104 state := state.TestStateStore(t) 105 jobWatcher, cancelWatcher := testDrainingJobWatcher(t, state) 106 defer cancelWatcher() 107 drainingNode, runningNode := testNodes(t, state) 108 109 var index uint64 = 101 110 count := 8 111 112 newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation { 113 a := mock.Alloc() 114 a.JobID = job.ID 115 a.Job = job 116 a.TaskGroup = job.TaskGroups[0].Name 117 a.NodeID = node.ID 118 return a 119 } 120 121 // 2 jobs with count 10, max parallel 3 122 jnss := make([]structs.NamespacedID, 2) 123 jobs := make([]*structs.Job, 2) 124 for i := 0; i < 2; i++ { 125 job := mock.Job() 126 jobs[i] = job 127 jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID} 128 job.TaskGroups[0].Migrate.MaxParallel = 3 129 job.TaskGroups[0].Count = count 130 require.Nil(state.UpsertJob(index, job)) 131 index++ 132 133 var allocs []*structs.Allocation 134 for i := 0; i < count; i++ { 135 a := newAlloc(drainingNode, job) 136 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 137 Healthy: helper.BoolToPtr(true), 138 } 139 allocs = append(allocs, a) 140 } 141 142 require.Nil(state.UpsertAllocs(index, allocs)) 143 index++ 144 145 } 146 147 // Only register jobs with watcher after creating all data models as 148 // once the watcher starts we need to track the index carefully for 149 // updating the batch future 150 jobWatcher.RegisterJobs(jnss) 151 152 // Expect a first batch of MaxParallel allocs from each job 153 drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0) 154 155 // Fake migrating the drained allocs by starting new ones and stopping 156 // the old ones 157 drainedAllocs := make([]*structs.Allocation, len(drains.Allocs)) 158 for i, a := range drains.Allocs { 159 a.DesiredTransition.Migrate = helper.BoolToPtr(true) 160 161 // create a copy so we can reuse this slice 162 drainedAllocs[i] = a.Copy() 163 } 164 require.Nil(state.UpsertAllocs(index, drainedAllocs)) 165 drains.Resp.Respond(index, nil) 166 index++ 167 168 // Just setting ShouldMigrate should not cause any further drains 169 assertJobWatcherOps(t, jobWatcher, 0, 0) 170 171 // Proceed our fake migration along by creating new allocs and stopping 172 // old ones 173 replacements := make([]*structs.Allocation, len(drainedAllocs)) 174 updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2) 175 for i, a := range drainedAllocs { 176 // Stop drained allocs 177 a.DesiredTransition.Migrate = nil 178 a.DesiredStatus = structs.AllocDesiredStatusStop 179 180 // Create a replacement 181 replacement := mock.Alloc() 182 replacement.JobID = a.Job.ID 183 replacement.Job = a.Job 184 replacement.TaskGroup = a.TaskGroup 185 replacement.NodeID = runningNode.ID 186 // start in pending state with no health status 187 188 updates = append(updates, a, replacement) 189 replacements[i] = replacement.Copy() 190 } 191 require.Nil(state.UpsertAllocs(index, updates)) 192 index++ 193 194 // The drained allocs stopping cause migrations but no new drains 195 // because the replacements have not started 196 assertJobWatcherOps(t, jobWatcher, 0, 6) 197 198 // Finally kickoff further drain activity by "starting" replacements 199 for _, a := range replacements { 200 a.ClientStatus = structs.AllocClientStatusRunning 201 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 202 Healthy: helper.BoolToPtr(true), 203 } 204 } 205 require.Nil(state.UpsertAllocs(index, replacements)) 206 index++ 207 208 require.NotEmpty(jobWatcher.drainingJobs()) 209 210 // 6 new drains 211 drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0) 212 213 // Fake migrations once more to finish the drain 214 drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) 215 for i, a := range drains.Allocs { 216 a.DesiredTransition.Migrate = helper.BoolToPtr(true) 217 218 // create a copy so we can reuse this slice 219 drainedAllocs[i] = a.Copy() 220 } 221 require.Nil(state.UpsertAllocs(index, drainedAllocs)) 222 drains.Resp.Respond(index, nil) 223 index++ 224 225 assertJobWatcherOps(t, jobWatcher, 0, 0) 226 227 replacements = make([]*structs.Allocation, len(drainedAllocs)) 228 updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) 229 for i, a := range drainedAllocs { 230 a.DesiredTransition.Migrate = nil 231 a.DesiredStatus = structs.AllocDesiredStatusStop 232 233 replacement := newAlloc(runningNode, a.Job) 234 updates = append(updates, a, replacement) 235 replacements[i] = replacement.Copy() 236 } 237 require.Nil(state.UpsertAllocs(index, updates)) 238 index++ 239 240 assertJobWatcherOps(t, jobWatcher, 0, 6) 241 242 for _, a := range replacements { 243 a.ClientStatus = structs.AllocClientStatusRunning 244 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 245 Healthy: helper.BoolToPtr(true), 246 } 247 } 248 require.Nil(state.UpsertAllocs(index, replacements)) 249 index++ 250 251 require.NotEmpty(jobWatcher.drainingJobs()) 252 253 // Final 4 new drains 254 drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0) 255 256 // Fake migrations once more to finish the drain 257 drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) 258 for i, a := range drains.Allocs { 259 a.DesiredTransition.Migrate = helper.BoolToPtr(true) 260 261 // create a copy so we can reuse this slice 262 drainedAllocs[i] = a.Copy() 263 } 264 require.Nil(state.UpsertAllocs(index, drainedAllocs)) 265 drains.Resp.Respond(index, nil) 266 index++ 267 268 assertJobWatcherOps(t, jobWatcher, 0, 0) 269 270 replacements = make([]*structs.Allocation, len(drainedAllocs)) 271 updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) 272 for i, a := range drainedAllocs { 273 a.DesiredTransition.Migrate = nil 274 a.DesiredStatus = structs.AllocDesiredStatusStop 275 276 replacement := newAlloc(runningNode, a.Job) 277 updates = append(updates, a, replacement) 278 replacements[i] = replacement.Copy() 279 } 280 require.Nil(state.UpsertAllocs(index, updates)) 281 index++ 282 283 assertJobWatcherOps(t, jobWatcher, 0, 4) 284 285 for _, a := range replacements { 286 a.ClientStatus = structs.AllocClientStatusRunning 287 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 288 Healthy: helper.BoolToPtr(true), 289 } 290 } 291 require.Nil(state.UpsertAllocs(index, replacements)) 292 293 // No jobs should be left! 294 require.Empty(jobWatcher.drainingJobs()) 295 } 296 297 // DrainingJobWatcher tests: 298 // TODO Test that the watcher cancels its query when a new job is registered 299 300 // handleTaskGroupTestCase is the test case struct for TestHandleTaskGroup 301 // 302 // Two nodes will be initialized: one draining and one running. 303 type handleTaskGroupTestCase struct { 304 // Name of test 305 Name string 306 307 // Batch uses a batch job and alloc 308 Batch bool 309 310 // Expectations 311 ExpectedDrained int 312 ExpectedMigrated int 313 ExpectedDone bool 314 315 // Count overrides the default count of 10 if set 316 Count int 317 318 // MaxParallel overrides the default max_parallel of 1 if set 319 MaxParallel int 320 321 // AddAlloc will be called 10 times to create test allocs 322 // 323 // Allocs default to be healthy on the draining node 324 AddAlloc func(i int, a *structs.Allocation, drainingID, runningID string) 325 } 326 327 func TestHandeTaskGroup_Table(t *testing.T) { 328 cases := []handleTaskGroupTestCase{ 329 { 330 // All allocs on draining node 331 Name: "AllDraining", 332 ExpectedDrained: 1, 333 ExpectedMigrated: 0, 334 ExpectedDone: false, 335 }, 336 { 337 // All allocs on non-draining node 338 Name: "AllNonDraining", 339 ExpectedDrained: 0, 340 ExpectedMigrated: 0, 341 ExpectedDone: true, 342 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 343 a.NodeID = runningID 344 }, 345 }, 346 { 347 // Some allocs on non-draining node but not healthy 348 Name: "SomeNonDrainingUnhealthy", 349 ExpectedDrained: 0, 350 ExpectedMigrated: 0, 351 ExpectedDone: false, 352 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 353 if i%2 == 0 { 354 a.NodeID = runningID 355 a.DeploymentStatus = nil 356 } 357 }, 358 }, 359 { 360 // One draining, other allocs on non-draining node and healthy 361 Name: "OneDraining", 362 ExpectedDrained: 1, 363 ExpectedMigrated: 0, 364 ExpectedDone: false, 365 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 366 if i != 0 { 367 a.NodeID = runningID 368 } 369 }, 370 }, 371 { 372 // One already draining, other allocs on non-draining node and healthy 373 Name: "OneAlreadyDraining", 374 ExpectedDrained: 0, 375 ExpectedMigrated: 0, 376 ExpectedDone: false, 377 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 378 if i == 0 { 379 a.DesiredTransition.Migrate = helper.BoolToPtr(true) 380 return 381 } 382 a.NodeID = runningID 383 }, 384 }, 385 { 386 // One already drained, other allocs on non-draining node and healthy 387 Name: "OneAlreadyDrained", 388 ExpectedDrained: 0, 389 ExpectedMigrated: 1, 390 ExpectedDone: true, 391 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 392 if i == 0 { 393 a.DesiredStatus = structs.AllocDesiredStatusStop 394 return 395 } 396 a.NodeID = runningID 397 }, 398 }, 399 { 400 // One already drained, other allocs on non-draining node and healthy 401 Name: "OneAlreadyDrainedBatched", 402 Batch: true, 403 ExpectedDrained: 0, 404 ExpectedMigrated: 1, 405 ExpectedDone: true, 406 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 407 if i == 0 { 408 a.DesiredStatus = structs.AllocDesiredStatusStop 409 return 410 } 411 a.NodeID = runningID 412 }, 413 }, 414 { 415 // All allocs are terminl, nothing to be drained 416 Name: "AllMigrating", 417 ExpectedDrained: 0, 418 ExpectedMigrated: 10, 419 ExpectedDone: true, 420 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 421 a.DesiredStatus = structs.AllocDesiredStatusStop 422 }, 423 }, 424 { 425 // All allocs are terminl, nothing to be drained 426 Name: "AllMigratingBatch", 427 Batch: true, 428 ExpectedDrained: 0, 429 ExpectedMigrated: 10, 430 ExpectedDone: true, 431 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 432 a.DesiredStatus = structs.AllocDesiredStatusStop 433 }, 434 }, 435 { 436 // All allocs may be drained at once 437 Name: "AllAtOnce", 438 ExpectedDrained: 10, 439 ExpectedMigrated: 0, 440 ExpectedDone: false, 441 MaxParallel: 10, 442 }, 443 { 444 // Drain 2 445 Name: "Drain2", 446 ExpectedDrained: 2, 447 ExpectedMigrated: 0, 448 ExpectedDone: false, 449 MaxParallel: 2, 450 }, 451 { 452 // One on new node, one drained, and one draining 453 ExpectedDrained: 1, 454 ExpectedMigrated: 1, 455 MaxParallel: 2, 456 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 457 switch i { 458 case 0: 459 // One alloc on running node 460 a.NodeID = runningID 461 case 1: 462 // One alloc already migrated 463 a.DesiredStatus = structs.AllocDesiredStatusStop 464 } 465 }, 466 }, 467 { 468 // 8 on new node, one drained, and one draining 469 ExpectedDrained: 1, 470 ExpectedMigrated: 1, 471 MaxParallel: 2, 472 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 473 switch i { 474 case 0, 1, 2, 3, 4, 5, 6, 7: 475 a.NodeID = runningID 476 case 8: 477 a.DesiredStatus = structs.AllocDesiredStatusStop 478 } 479 }, 480 }, 481 { 482 // 5 on new node, two drained, and three draining 483 ExpectedDrained: 3, 484 ExpectedMigrated: 2, 485 MaxParallel: 5, 486 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 487 switch i { 488 case 0, 1, 2, 3, 4: 489 a.NodeID = runningID 490 case 8, 9: 491 a.DesiredStatus = structs.AllocDesiredStatusStop 492 } 493 }, 494 }, 495 { 496 // Not all on new node have health set 497 Name: "PendingHealth", 498 ExpectedDrained: 1, 499 ExpectedMigrated: 1, 500 MaxParallel: 3, 501 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 502 switch i { 503 case 0: 504 // Deployment status UNset for 1 on new node 505 a.NodeID = runningID 506 a.DeploymentStatus = nil 507 case 1, 2, 3, 4: 508 // Deployment status set for 4 on new node 509 a.NodeID = runningID 510 case 9: 511 a.DesiredStatus = structs.AllocDesiredStatusStop 512 } 513 }, 514 }, 515 { 516 // 5 max parallel - 1 migrating - 2 with unset health = 2 drainable 517 Name: "PendingHealthHigherMax", 518 ExpectedDrained: 2, 519 ExpectedMigrated: 1, 520 MaxParallel: 5, 521 AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { 522 switch i { 523 case 0, 1: 524 // Deployment status UNset for 2 on new node 525 a.NodeID = runningID 526 a.DeploymentStatus = nil 527 case 2, 3, 4: 528 // Deployment status set for 3 on new node 529 a.NodeID = runningID 530 case 9: 531 a.DesiredStatus = structs.AllocDesiredStatusStop 532 } 533 }, 534 }, 535 } 536 537 for _, testCase := range cases { 538 t.Run(testCase.Name, func(t *testing.T) { 539 testHandleTaskGroup(t, testCase) 540 }) 541 } 542 } 543 544 func testHandleTaskGroup(t *testing.T, tc handleTaskGroupTestCase) { 545 t.Parallel() 546 require := require.New(t) 547 assert := assert.New(t) 548 549 // Create nodes 550 state := state.TestStateStore(t) 551 drainingNode, runningNode := testNodes(t, state) 552 553 job := mock.Job() 554 if tc.Batch { 555 job = mock.BatchJob() 556 } 557 job.TaskGroups[0].Count = 10 558 if tc.Count > 0 { 559 job.TaskGroups[0].Count = tc.Count 560 } 561 if tc.MaxParallel > 0 { 562 job.TaskGroups[0].Migrate.MaxParallel = tc.MaxParallel 563 } 564 require.Nil(state.UpsertJob(102, job)) 565 566 var allocs []*structs.Allocation 567 for i := 0; i < 10; i++ { 568 a := mock.Alloc() 569 if tc.Batch { 570 a = mock.BatchAlloc() 571 } 572 a.JobID = job.ID 573 a.Job = job 574 a.TaskGroup = job.TaskGroups[0].Name 575 576 // Default to being healthy on the draining node 577 a.NodeID = drainingNode.ID 578 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 579 Healthy: helper.BoolToPtr(true), 580 } 581 if tc.AddAlloc != nil { 582 tc.AddAlloc(i, a, drainingNode.ID, runningNode.ID) 583 } 584 allocs = append(allocs, a) 585 } 586 587 require.Nil(state.UpsertAllocs(103, allocs)) 588 snap, err := state.Snapshot() 589 require.Nil(err) 590 591 res := newJobResult() 592 require.Nil(handleTaskGroup(snap, tc.Batch, job.TaskGroups[0], allocs, 102, res)) 593 assert.Lenf(res.drain, tc.ExpectedDrained, "Drain expected %d but found: %d", 594 tc.ExpectedDrained, len(res.drain)) 595 assert.Lenf(res.migrated, tc.ExpectedMigrated, "Migrate expected %d but found: %d", 596 tc.ExpectedMigrated, len(res.migrated)) 597 assert.Equal(tc.ExpectedDone, res.done) 598 } 599 600 func TestHandleTaskGroup_Migrations(t *testing.T) { 601 t.Parallel() 602 require := require.New(t) 603 604 // Create a draining node 605 state := state.TestStateStore(t) 606 n := mock.Node() 607 n.DrainStrategy = &structs.DrainStrategy{ 608 DrainSpec: structs.DrainSpec{ 609 Deadline: 5 * time.Minute, 610 }, 611 ForceDeadline: time.Now().Add(1 * time.Minute), 612 } 613 require.Nil(state.UpsertNode(100, n)) 614 615 job := mock.Job() 616 require.Nil(state.UpsertJob(101, job)) 617 618 // Create 10 done allocs 619 var allocs []*structs.Allocation 620 for i := 0; i < 10; i++ { 621 a := mock.Alloc() 622 a.Job = job 623 a.TaskGroup = job.TaskGroups[0].Name 624 a.NodeID = n.ID 625 a.DeploymentStatus = &structs.AllocDeploymentStatus{ 626 Healthy: helper.BoolToPtr(false), 627 } 628 629 if i%2 == 0 { 630 a.DesiredStatus = structs.AllocDesiredStatusStop 631 } else { 632 a.ClientStatus = structs.AllocClientStatusFailed 633 } 634 allocs = append(allocs, a) 635 } 636 require.Nil(state.UpsertAllocs(102, allocs)) 637 638 snap, err := state.Snapshot() 639 require.Nil(err) 640 641 // Handle before and after indexes as both service and batch 642 res := newJobResult() 643 require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res)) 644 require.Empty(res.drain) 645 require.Len(res.migrated, 10) 646 require.True(res.done) 647 648 res = newJobResult() 649 require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res)) 650 require.Empty(res.drain) 651 require.Len(res.migrated, 10) 652 require.True(res.done) 653 654 res = newJobResult() 655 require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res)) 656 require.Empty(res.drain) 657 require.Empty(res.migrated) 658 require.True(res.done) 659 660 res = newJobResult() 661 require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res)) 662 require.Empty(res.drain) 663 require.Empty(res.migrated) 664 require.True(res.done) 665 }