github.com/hernad/nomad@v1.6.112/command/job_restart_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package command 5 6 import ( 7 "context" 8 "fmt" 9 "net/http" 10 "net/http/httptest" 11 "net/http/httputil" 12 neturl "net/url" 13 "regexp" 14 "sort" 15 "strings" 16 "sync/atomic" 17 "testing" 18 "time" 19 20 "github.com/google/go-cmp/cmp/cmpopts" 21 "github.com/hashicorp/go-set" 22 "github.com/hernad/nomad/api" 23 "github.com/hernad/nomad/ci" 24 "github.com/hernad/nomad/command/agent" 25 "github.com/hernad/nomad/helper/pointer" 26 "github.com/hernad/nomad/testutil" 27 "github.com/mitchellh/cli" 28 29 "github.com/shoenig/test/must" 30 "github.com/shoenig/test/wait" 31 ) 32 33 func TestJobRestartCommand_Implements(t *testing.T) { 34 ci.Parallel(t) 35 var _ cli.Command = &JobRestartCommand{} 36 } 37 38 func TestJobRestartCommand_parseAndValidate(t *testing.T) { 39 ci.Parallel(t) 40 41 testCases := []struct { 42 name string 43 args []string 44 expectedErr string 45 expectedCmd *JobRestartCommand 46 }{ 47 { 48 name: "missing job", 49 args: []string{}, 50 expectedErr: "This command takes one argument", 51 }, 52 { 53 name: "too many args", 54 args: []string{"one", "two", "three"}, 55 expectedErr: "This command takes one argument", 56 }, 57 { 58 name: "tasks and groups", 59 args: []string{ 60 "-task", "my-task-1", "-task", "my-task-2", 61 "-group", "my-group-1", "-group", "my-group-2", 62 "my-job", 63 }, 64 expectedCmd: &JobRestartCommand{ 65 jobID: "my-job", 66 groups: set.From([]string{"my-group-1", "my-group-2"}), 67 tasks: set.From([]string{"my-task-1", "my-task-2"}), 68 batchSize: 1, 69 }, 70 }, 71 { 72 name: "all tasks", 73 args: []string{"-all-tasks", "my-job"}, 74 expectedCmd: &JobRestartCommand{ 75 jobID: "my-job", 76 allTasks: true, 77 batchSize: 1, 78 }, 79 }, 80 { 81 name: "all tasks conflicts with task", 82 args: []string{"-all-tasks", "-task", "my-task", "-yes", "my-job"}, 83 expectedErr: "The -all-tasks option 
cannot be used with -task", 84 }, 85 { 86 name: "batch size as number", 87 args: []string{"-batch-size", "10", "my-job"}, 88 expectedCmd: &JobRestartCommand{ 89 jobID: "my-job", 90 batchSize: 10, 91 }, 92 }, 93 { 94 name: "batch size as percentage", 95 args: []string{"-batch-size", "10%", "my-job"}, 96 expectedCmd: &JobRestartCommand{ 97 jobID: "my-job", 98 batchSize: 10, 99 batchSizePercent: true, 100 }, 101 }, 102 { 103 name: "batch size not valid", 104 args: []string{"-batch-size", "not-valid", "my-job"}, 105 expectedErr: "Invalid -batch-size value", 106 }, 107 { 108 name: "batch size decimal not valid", 109 args: []string{"-batch-size", "1.5", "my-job"}, 110 expectedErr: "Invalid -batch-size value", 111 }, 112 { 113 name: "batch size zero", 114 args: []string{"-batch-size", "0", "my-job"}, 115 expectedErr: "Invalid -batch-size value", 116 }, 117 { 118 name: "batch size decimal percent not valid", 119 args: []string{"-batch-size", "1.5%", "my-job"}, 120 expectedErr: "Invalid -batch-size value", 121 }, 122 { 123 name: "batch size zero percentage", 124 args: []string{"-batch-size", "0%", "my-job"}, 125 expectedErr: "Invalid -batch-size value", 126 }, 127 { 128 name: "batch size with multiple numbers and percentages", 129 args: []string{"-batch-size", "15%10%", "my-job"}, 130 expectedErr: "Invalid -batch-size value", 131 }, 132 { 133 name: "batch wait ask", 134 args: []string{"-batch-wait", "ask", "my-job"}, 135 expectedErr: "terminal is not interactive", // Can't test non-interactive. 
136 }, 137 { 138 name: "batch wait duration", 139 args: []string{"-batch-wait", "10s", "my-job"}, 140 expectedCmd: &JobRestartCommand{ 141 jobID: "my-job", 142 batchSize: 1, 143 batchWait: 10 * time.Second, 144 }, 145 }, 146 { 147 name: "batch wait invalid", 148 args: []string{"-batch-wait", "10", "my-job"}, 149 expectedErr: "Invalid -batch-wait value", 150 }, 151 { 152 name: "on error fail", 153 args: []string{"-on-error", "fail", "my-job"}, 154 expectedCmd: &JobRestartCommand{ 155 jobID: "my-job", 156 batchSize: 1, 157 onError: jobRestartOnErrorFail, 158 }, 159 }, 160 { 161 name: "on error invalid", 162 args: []string{"-on-error", "invalid", "my-job"}, 163 expectedErr: "Invalid -on-error value", 164 }, 165 { 166 name: "no shutdown delay", 167 args: []string{"-no-shutdown-delay", "my-job"}, 168 expectedCmd: &JobRestartCommand{ 169 jobID: "my-job", 170 batchSize: 1, 171 noShutdownDelay: true, 172 }, 173 }, 174 { 175 name: "reschedule", 176 args: []string{"-reschedule", "my-job"}, 177 expectedCmd: &JobRestartCommand{ 178 jobID: "my-job", 179 batchSize: 1, 180 reschedule: true, 181 }, 182 }, 183 { 184 name: "reschedule conflicts with task", 185 args: []string{"-reschedule", "-task", "my-task", "-yes", "my-job"}, 186 expectedErr: "The -reschedule option cannot be used with -task", 187 }, 188 { 189 name: "verbose", 190 args: []string{"-verbose", "my-job"}, 191 expectedCmd: &JobRestartCommand{ 192 jobID: "my-job", 193 batchSize: 1, 194 verbose: true, 195 length: fullId, 196 }, 197 }, 198 } 199 200 for _, tc := range testCases { 201 t.Run(tc.name, func(t *testing.T) { 202 ui := &cli.ConcurrentUi{Ui: cli.NewMockUi()} 203 meta := Meta{Ui: ui} 204 205 // Set some default values if not defined in test case. 
206 if tc.expectedCmd != nil { 207 tc.expectedCmd.Meta = meta 208 209 if tc.expectedCmd.length == 0 { 210 tc.expectedCmd.length = shortId 211 } 212 if tc.expectedCmd.groups == nil { 213 tc.expectedCmd.groups = set.New[string](0) 214 } 215 if tc.expectedCmd.tasks == nil { 216 tc.expectedCmd.tasks = set.New[string](0) 217 } 218 if tc.expectedCmd.onError == "" { 219 tc.expectedCmd.onError = jobRestartOnErrorAsk 220 tc.expectedCmd.autoYes = true 221 tc.args = append([]string{"-yes"}, tc.args...) 222 } 223 } 224 225 cmd := &JobRestartCommand{Meta: meta} 226 code, err := cmd.parseAndValidate(tc.args) 227 228 if tc.expectedErr != "" { 229 must.NonZero(t, code) 230 must.ErrorContains(t, err, tc.expectedErr) 231 } else { 232 must.NoError(t, err) 233 must.Zero(t, code) 234 must.Eq(t, tc.expectedCmd, cmd, must.Cmp(cmpopts.IgnoreFields(JobRestartCommand{}, "Meta", "Meta.Ui"))) 235 } 236 }) 237 } 238 } 239 240 func TestJobRestartCommand_Run(t *testing.T) { 241 ci.Parallel(t) 242 243 // Create a job with multiple tasks, groups, and allocations. 244 prestartTask := api.NewTask("prestart", "mock_driver"). 245 SetConfig("run_for", "100ms"). 246 SetConfig("exit_code", 0). 247 SetLifecycle(&api.TaskLifecycle{ 248 Hook: api.TaskLifecycleHookPrestart, 249 Sidecar: false, 250 }) 251 sidecarTask := api.NewTask("sidecar", "mock_driver"). 252 SetConfig("run_for", "1m"). 253 SetConfig("exit_code", 0). 254 SetLifecycle(&api.TaskLifecycle{ 255 Hook: api.TaskLifecycleHookPoststart, 256 Sidecar: true, 257 }) 258 mainTask := api.NewTask("main", "mock_driver"). 259 SetConfig("run_for", "1m"). 260 SetConfig("exit_code", 0) 261 262 jobID := "test_job_restart_cmd" 263 job := api.NewServiceJob(jobID, jobID, "global", 1). 264 AddDatacenter("dc1"). 265 AddTaskGroup( 266 api.NewTaskGroup("single_task", 3). 267 AddTask(mainTask), 268 ). 269 AddTaskGroup( 270 api.NewTaskGroup("multiple_tasks", 2). 271 AddTask(prestartTask). 272 AddTask(sidecarTask). 
273 AddTask(mainTask), 274 ) 275 276 testCases := []struct { 277 name string 278 args []string // Job arg is added automatically. 279 expectedCode int 280 validateFn func(*testing.T, *api.Client, []*api.AllocationListStub, string, string) 281 }{ 282 { 283 name: "restart only running tasks in all groups by default", 284 args: []string{"-batch-size", "100%"}, 285 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 286 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 287 "single_task": { 288 "main": true, 289 }, 290 "multiple_tasks": { 291 "prestart": false, 292 "sidecar": true, 293 "main": true, 294 }, 295 }) 296 297 // Check that allocations restarted in a single batch. 298 batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") 299 must.Len(t, 5, batches[0]) 300 must.StrContains(t, stdout, "Restarting 1st batch") 301 must.StrNotContains(t, stdout, "restarting the next batch") 302 303 }, 304 }, 305 { 306 name: "restart specific task in all groups", 307 args: []string{"-batch-size", "100%", "-task", "main"}, 308 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 309 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 310 "single_task": { 311 "main": true, 312 }, 313 "multiple_tasks": { 314 "prestart": false, 315 "sidecar": false, 316 "main": true, 317 }, 318 }) 319 320 // Check that allocations restarted in a single batch. 
321 batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") 322 must.Len(t, 5, batches[0]) 323 must.StrContains(t, stdout, "Restarting 1st batch") 324 must.StrNotContains(t, stdout, "restarting the next batch") 325 }, 326 }, 327 { 328 name: "restart multiple tasks in all groups", 329 args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar"}, 330 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 331 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 332 "single_task": { 333 "main": true, 334 }, 335 "multiple_tasks": { 336 "prestart": false, 337 "sidecar": true, 338 "main": true, 339 }, 340 }) 341 342 // Check that allocations restarted in a single batch. 343 batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") 344 must.Len(t, 5, batches[0]) 345 must.StrContains(t, stdout, "Restarting 1st batch") 346 must.StrNotContains(t, stdout, "restarting the next batch") 347 }, 348 }, 349 { 350 name: "restart all tasks in all groups", 351 args: []string{"-batch-size", "100%", "-all-tasks"}, 352 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 353 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 354 "single_task": { 355 "main": true, 356 }, 357 "multiple_tasks": { 358 "prestart": true, 359 "sidecar": true, 360 "main": true, 361 }, 362 }) 363 364 // Check that allocations restarted in a single batch. 
365 batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") 366 must.Len(t, 5, batches[0]) 367 must.StrContains(t, stdout, "Restarting 1st batch") 368 must.StrNotContains(t, stdout, "restarting the next batch") 369 }, 370 }, 371 { 372 name: "restart running tasks in specific group", 373 args: []string{"-batch-size", "100%", "-group", "single_task"}, 374 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 375 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 376 "single_task": { 377 "main": true, 378 }, 379 "multiple_tasks": { 380 "prestart": false, 381 "sidecar": false, 382 "main": false, 383 }, 384 }) 385 386 // Check that allocations restarted in a single batch. 387 batches := getRestartBatches(restarted, []string{"single_task"}, "main") 388 must.Len(t, 3, batches[0]) 389 must.StrContains(t, stdout, "Restarting 1st batch") 390 must.StrNotContains(t, stdout, "restarting the next batch") 391 392 }, 393 }, 394 { 395 name: "restart specific task that is not running", 396 args: []string{"-batch-size", "100%", "-task", "prestart"}, 397 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 398 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 399 "single_task": { 400 "main": false, 401 }, 402 "multiple_tasks": { 403 "prestart": false, 404 "sidecar": false, 405 "main": false, 406 }, 407 }) 408 409 // Check that allocations restarted in a single batch. 410 batches := getRestartBatches(restarted, []string{"single_task"}, "main") 411 must.Len(t, 3, batches[0]) 412 must.StrContains(t, stdout, "Restarting 1st batch") 413 must.StrNotContains(t, stdout, "restarting the next batch") 414 415 // Check that we have an error message. 
416 must.StrContains(t, stderr, "Task not running") 417 }, 418 expectedCode: 1, 419 }, 420 { 421 name: "restart specific task in specific group", 422 args: []string{"-batch-size", "100%", "-task", "main", "-group", "single_task"}, 423 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 424 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 425 "single_task": { 426 "main": true, 427 }, 428 "multiple_tasks": { 429 "prestart": false, 430 "sidecar": false, 431 "main": false, 432 }, 433 }) 434 435 // Check that allocations restarted in a single batch. 436 batches := getRestartBatches(restarted, []string{"single_task"}, "main") 437 must.Len(t, 3, batches[0]) 438 must.StrContains(t, stdout, "Restarting 1st batch") 439 must.StrNotContains(t, stdout, "restarting the next batch") 440 }, 441 }, 442 { 443 name: "restart multiple tasks in specific group", 444 args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar", "-group", "multiple_tasks"}, 445 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 446 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 447 "single_task": { 448 "main": false, 449 }, 450 "multiple_tasks": { 451 "prestart": false, 452 "sidecar": true, 453 "main": true, 454 }, 455 }) 456 457 // Check that allocations restarted in a single batch. 
458 batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main") 459 must.Len(t, 2, batches[0]) 460 must.StrContains(t, stdout, "Restarting 1st batch") 461 must.StrNotContains(t, stdout, "restarting the next batch") 462 }, 463 }, 464 { 465 name: "restart all tasks in specific group", 466 args: []string{"-batch-size", "100%", "-all-tasks", "-group", "multiple_tasks"}, 467 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 468 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 469 "single_task": { 470 "main": false, 471 }, 472 "multiple_tasks": { 473 "prestart": true, 474 "sidecar": true, 475 "main": true, 476 }, 477 }) 478 479 // Check that allocations restarted in a single batch. 480 batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main") 481 must.Len(t, 2, batches[0]) 482 must.StrContains(t, stdout, "Restarting 1st batch") 483 must.StrNotContains(t, stdout, "restarting the next batch") 484 }, 485 }, 486 { 487 name: "restart in batches", 488 args: []string{"-batch-size", "3", "-batch-wait", "3s", "-task", "main"}, 489 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 490 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 491 "single_task": { 492 "main": true, 493 }, 494 "multiple_tasks": { 495 "prestart": false, 496 "sidecar": false, 497 "main": true, 498 }, 499 }) 500 501 // Check that allocations were properly batched. 502 batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main") 503 504 must.Len(t, 3, batches[0]) 505 must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations") 506 507 must.Len(t, 2, batches[1]) 508 must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations") 509 510 // Check that we only waited between batches. 
511 waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch") 512 must.Eq(t, 1, waitMsgCount) 513 514 // Check that batches waited the expected time. 515 batch1Restart := batches[0][0].TaskStates["main"].LastRestart 516 batch2Restart := batches[1][0].TaskStates["main"].LastRestart 517 diff := batch2Restart.Sub(batch1Restart) 518 must.Between(t, 3*time.Second, diff, 4*time.Second) 519 }, 520 }, 521 { 522 name: "restart in percent batch", 523 args: []string{"-batch-size", "50%", "-batch-wait", "3s", "-task", "main"}, 524 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 525 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 526 "single_task": { 527 "main": true, 528 }, 529 "multiple_tasks": { 530 "prestart": false, 531 "sidecar": false, 532 "main": true, 533 }, 534 }) 535 536 // Check that allocations were properly batched. 537 batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main") 538 539 must.Len(t, 3, batches[0]) 540 must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations") 541 542 must.Len(t, 2, batches[1]) 543 must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations") 544 545 // Check that we only waited between batches. 546 waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch") 547 must.Eq(t, 1, waitMsgCount) 548 549 // Check that batches waited the expected time. 
550 batch1Restart := batches[0][0].TaskStates["main"].LastRestart 551 batch2Restart := batches[1][0].TaskStates["main"].LastRestart 552 diff := batch2Restart.Sub(batch1Restart) 553 must.Between(t, 3*time.Second, diff, 4*time.Second) 554 }, 555 }, 556 { 557 name: "restart in batch ask with yes", 558 args: []string{"-batch-size", "100%", "-batch-wait", "ask", "-yes", "-group", "single_task"}, 559 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 560 restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ 561 "single_task": { 562 "main": true, 563 }, 564 "multiple_tasks": { 565 "prestart": false, 566 "sidecar": false, 567 "main": false, 568 }, 569 }) 570 571 // Check that allocations restarted in a single batch. 572 batches := getRestartBatches(restarted, []string{"single_task"}, "main") 573 must.Len(t, 3, batches[0]) 574 must.StrContains(t, stdout, "Restarting 1st batch") 575 must.StrNotContains(t, stdout, "restarting the next batch") 576 }, 577 }, 578 { 579 name: "reschedule in batches", 580 args: []string{"-reschedule", "-batch-size", "3"}, 581 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 582 // Expect all allocations were rescheduled. 583 reschedules := map[string]bool{} 584 for _, alloc := range allocs { 585 reschedules[alloc.ID] = true 586 } 587 waitAllocsRescheduled(t, client, reschedules) 588 589 // Check that allocations were properly batched. 
590 must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations") 591 must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations") 592 must.StrNotContains(t, stdout, "Waiting") 593 }, 594 }, 595 { 596 name: "reschedule specific group", 597 args: []string{"-reschedule", "-batch-size", "100%", "-group", "single_task"}, 598 validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { 599 // Expect that only allocs for the single_task group were 600 // rescheduled. 601 reschedules := map[string]bool{} 602 for _, alloc := range allocs { 603 if alloc.TaskGroup == "single_task" { 604 reschedules[alloc.ID] = true 605 } 606 } 607 waitAllocsRescheduled(t, client, reschedules) 608 609 // Check that allocations restarted in a single batch. 610 must.StrContains(t, stdout, "Restarting 1st batch") 611 must.StrNotContains(t, stdout, "restarting the next batch") 612 }, 613 }, 614 } 615 616 for _, tc := range testCases { 617 tc := tc 618 t.Run(tc.name, func(t *testing.T) { 619 // Run each test case in parallel because they are fairly slow. 620 ci.Parallel(t) 621 622 // Initialize UI and command. 623 ui := cli.NewMockUi() 624 cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} 625 626 // Start client and server and wait for node to be ready. 627 // User separate cluster for each test case so they can run in 628 // parallel without affecting each other. 629 srv, client, url := testServer(t, true, nil) 630 defer srv.Shutdown() 631 632 waitForNodes(t, client) 633 634 // Register test job and wait for its allocs to be running. 
635 resp, _, err := client.Jobs().Register(job, nil) 636 must.NoError(t, err) 637 638 code := waitForSuccess(ui, client, fullId, t, resp.EvalID) 639 must.Zero(t, code) 640 641 allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil) 642 must.NoError(t, err) 643 for _, alloc := range allocStubs { 644 waitForAllocRunning(t, client, alloc.ID) 645 } 646 647 // Fetch allocations before the restart so we know which ones are 648 // supposed to be affected in case the test reschedules allocs. 649 allocStubs, _, err = client.Jobs().Allocations(jobID, true, nil) 650 must.NoError(t, err) 651 652 // Prepend server URL and append job ID to the test case command. 653 args := []string{"-address", url, "-yes"} 654 args = append(args, tc.args...) 655 args = append(args, jobID) 656 657 // Run job restart command. 658 code = cmd.Run(args) 659 must.Eq(t, code, tc.expectedCode) 660 661 // Run test case validation function. 662 if tc.validateFn != nil { 663 tc.validateFn(t, client, allocStubs, ui.OutputWriter.String(), ui.ErrorWriter.String()) 664 } 665 }) 666 } 667 } 668 669 func TestJobRestartCommand_jobPrefixAndNamespace(t *testing.T) { 670 ci.Parallel(t) 671 672 ui := cli.NewMockUi() 673 674 // Start client and server and wait for node to be ready. 675 srv, client, url := testServer(t, true, nil) 676 defer srv.Shutdown() 677 678 waitForNodes(t, client) 679 680 // Create non-default namespace. 681 _, err := client.Namespaces().Register(&api.Namespace{Name: "prod"}, nil) 682 must.NoError(t, err) 683 684 // Register job with same name in both namespaces. 
685 evalIDs := []string{} 686 687 jobDefault := testJob("test_job_restart") 688 resp, _, err := client.Jobs().Register(jobDefault, nil) 689 must.NoError(t, err) 690 evalIDs = append(evalIDs, resp.EvalID) 691 692 jobProd := testJob("test_job_restart") 693 jobProd.Namespace = pointer.Of("prod") 694 resp, _, err = client.Jobs().Register(jobProd, nil) 695 must.NoError(t, err) 696 evalIDs = append(evalIDs, resp.EvalID) 697 698 jobUniqueProd := testJob("test_job_restart_prod_ns") 699 jobUniqueProd.Namespace = pointer.Of("prod") 700 resp, _, err = client.Jobs().Register(jobUniqueProd, nil) 701 must.NoError(t, err) 702 evalIDs = append(evalIDs, resp.EvalID) 703 704 // Wait for evals to be processed. 705 for _, evalID := range evalIDs { 706 code := waitForSuccess(ui, client, fullId, t, evalID) 707 must.Eq(t, 0, code) 708 } 709 ui.OutputWriter.Reset() 710 711 testCases := []struct { 712 name string 713 args []string 714 expectedErr string 715 }{ 716 { 717 name: "prefix match in default namespace", 718 args: []string{"test_job"}, 719 }, 720 { 721 name: "invalid job", 722 args: []string{"not-valid"}, 723 expectedErr: "No job(s) with prefix or ID", 724 }, 725 { 726 name: "prefix matches multiple jobs", 727 args: []string{"-namespace", "prod", "test_job"}, 728 expectedErr: "matched multiple jobs", 729 }, 730 { 731 name: "prefix matches multiple jobs across namespaces", 732 args: []string{"-namespace", "*", "test_job"}, 733 expectedErr: "matched multiple jobs", 734 }, 735 { 736 name: "unique prefix match across namespaces", 737 args: []string{"-namespace", "*", "test_job_restart_prod"}, 738 }, 739 } 740 741 for _, tc := range testCases { 742 t.Run(tc.name, func(t *testing.T) { 743 defer func() { 744 ui.OutputWriter.Reset() 745 ui.ErrorWriter.Reset() 746 }() 747 748 cmd := &JobRestartCommand{ 749 Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}}, 750 } 751 args := append([]string{"-address", url, "-yes"}, tc.args...) 
752 code := cmd.Run(args) 753 754 if tc.expectedErr != "" { 755 must.NonZero(t, code) 756 must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr) 757 } else { 758 must.Zero(t, code) 759 } 760 }) 761 } 762 } 763 764 func TestJobRestartCommand_noAllocs(t *testing.T) { 765 ci.Parallel(t) 766 767 ui := cli.NewMockUi() 768 cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} 769 770 // Start client and server and wait for node to be ready. 771 srv, client, url := testServer(t, true, nil) 772 defer srv.Shutdown() 773 774 waitForNodes(t, client) 775 776 // Register test job with impossible constraint so it doesn't get allocs. 777 jobID := "test_job_restart_no_allocs" 778 job := testJob(jobID) 779 job.Datacenters = []string{"invalid"} 780 781 resp, _, err := client.Jobs().Register(job, nil) 782 must.NoError(t, err) 783 784 code := waitForSuccess(ui, client, fullId, t, resp.EvalID) 785 must.Eq(t, 2, code) // Placement is expected to fail so exit code is not 0. 786 ui.OutputWriter.Reset() 787 788 // Run job restart command and expect it to exit without restarts. 789 code = cmd.Run([]string{ 790 "-address", url, 791 "-yes", 792 jobID, 793 }) 794 must.Zero(t, code) 795 must.StrContains(t, ui.OutputWriter.String(), "No allocations to restart") 796 } 797 798 func TestJobRestartCommand_rescheduleFail(t *testing.T) { 799 ci.Parallel(t) 800 801 ui := cli.NewMockUi() 802 cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} 803 804 // Start client and server and wait for node to be ready. 805 srv, client, url := testServer(t, true, nil) 806 defer srv.Shutdown() 807 808 waitForNodes(t, client) 809 810 // Register test job with 3 allocs. 811 jobID := "test_job_restart_reschedule_fail" 812 job := testJob(jobID) 813 job.TaskGroups[0].Count = pointer.Of(3) 814 815 resp, _, err := client.Jobs().Register(job, nil) 816 must.NoError(t, err) 817 818 code := waitForSuccess(ui, client, fullId, t, resp.EvalID) 819 must.Zero(t, code) 820 ui.OutputWriter.Reset() 821 822 // Wait for allocs to be running. 
823 allocs, _, err := client.Jobs().Allocations(jobID, true, nil) 824 must.NoError(t, err) 825 for _, alloc := range allocs { 826 waitForAllocRunning(t, client, alloc.ID) 827 } 828 829 // Mark node as ineligible to prevent allocs from being replaced. 830 nodeID := srv.Agent.Client().NodeID() 831 client.Nodes().ToggleEligibility(nodeID, false, nil) 832 833 // Run job restart command and expect it to fail. 834 code = cmd.Run([]string{ 835 "-address", url, 836 "-batch-size", "2", 837 "-reschedule", 838 "-yes", 839 jobID, 840 }) 841 must.One(t, code) 842 must.StrContains(t, ui.ErrorWriter.String(), "No nodes were eligible for evaluation") 843 } 844 845 func TestJobRestartCommand_monitorReplacementAlloc(t *testing.T) { 846 ci.Parallel(t) 847 848 ui := cli.NewMockUi() 849 cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} 850 851 srv, client, _ := testServer(t, true, nil) 852 defer srv.Shutdown() 853 waitForNodes(t, client) 854 855 // Register test job and update it twice so we end up with three 856 // allocations, one replacing the next one. 857 jobID := "test_job_restart_monitor_replacement" 858 job := testJob(jobID) 859 860 for i := 1; i <= 3; i++ { 861 job.TaskGroups[0].Tasks[0].Config["run_for"] = fmt.Sprintf("%ds", i) 862 resp, _, err := client.Jobs().Register(job, nil) 863 must.NoError(t, err) 864 865 code := waitForSuccess(ui, client, fullId, t, resp.EvalID) 866 must.Zero(t, code) 867 } 868 ui.OutputWriter.Reset() 869 870 // Prepare the command internals. We want to run a specific function and 871 // target a specific allocation, so we can't run the full command. 872 cmd.client = client 873 cmd.verbose = true 874 cmd.length = fullId 875 876 // Fetch, sort, and monitor the oldest allocation. 
877 allocs, _, err := client.Jobs().Allocations(jobID, true, nil) 878 must.NoError(t, err) 879 sort.Slice(allocs, func(i, j int) bool { 880 return allocs[i].CreateIndex < allocs[j].CreateIndex 881 }) 882 883 errCh := make(chan error) 884 go cmd.monitorReplacementAlloc(context.Background(), AllocationListStubWithJob{ 885 AllocationListStub: allocs[0], 886 Job: job, 887 }, errCh) 888 889 // Make sure the command doesn't get stuck and that we traverse the 890 // follow-up allocations properly. 891 must.Wait(t, wait.InitialSuccess( 892 wait.ErrorFunc(func() error { 893 select { 894 case err := <-errCh: 895 return err 896 default: 897 return fmt.Errorf("waiting for response") 898 } 899 }), 900 wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second), 901 )) 902 must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[0].ID, allocs[1].ID)) 903 must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[1].ID, allocs[2].ID)) 904 must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q is %q", allocs[2].ID, api.AllocClientStatusRunning)) 905 } 906 907 func TestJobRestartCommand_activeDeployment(t *testing.T) { 908 ci.Parallel(t) 909 910 srv, client, url := testServer(t, true, nil) 911 defer srv.Shutdown() 912 waitForNodes(t, client) 913 914 // Register test job and update it once to trigger a deployment. 915 jobID := "test_job_restart_deployment" 916 job := testJob(jobID) 917 job.Type = pointer.Of(api.JobTypeService) 918 job.Update = &api.UpdateStrategy{ 919 Canary: pointer.Of(1), 920 AutoPromote: pointer.Of(false), 921 } 922 923 _, _, err := client.Jobs().Register(job, nil) 924 must.NoError(t, err) 925 926 _, _, err = client.Jobs().Register(job, nil) 927 must.NoError(t, err) 928 929 // Wait for a deployment to be running. 
	// Block until the job has a deployment in "running" status so the
	// restart command below is guaranteed to find one.
	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(func() error {
			deployments, _, err := client.Jobs().Deployments(jobID, true, nil)
			if err != nil {
				return err
			}
			for _, d := range deployments {
				if d.Status == api.DeploymentStatusRunning {
					return nil
				}
			}
			return fmt.Errorf("no running deployments")
		}),
		wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second),
	))

	// Run job restart command and expect it to fail because the job has a
	// deployment in progress.
	ui := cli.NewMockUi()
	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}

	code := cmd.Run([]string{
		"-address", url,
		"-on-error", jobRestartOnErrorFail,
		"-verbose",
		jobID,
	})
	must.One(t, code)
	must.RegexMatch(t, regexp.MustCompile(`Deployment .+ is "running"`), ui.ErrorWriter.String())
}

// TestJobRestartCommand_ACL verifies the ACL capabilities required by the
// job restart command: alloc-lifecycle and read-job to restart a job by its
// exact ID, plus list-jobs when the job is addressed by an ID prefix.
func TestJobRestartCommand_ACL(t *testing.T) {
	ci.Parallel(t)

	// Start server with ACL enabled.
	srv, client, url := testServer(t, true, func(c *agent.Config) {
		c.ACL.Enabled = true
	})
	defer srv.Shutdown()

	// Setup requests are made with the root token; each test case creates
	// its own scoped token below.
	rootTokenOpts := &api.WriteOptions{
		AuthToken: srv.RootToken.SecretID,
	}

	// Register test job.
	jobID := "test_job_restart_acl"
	job := testJob(jobID)
	_, _, err := client.Jobs().Register(job, rootTokenOpts)
	must.NoError(t, err)

	// Wait for allocs to be running before issuing restarts.
	waitForJobAllocsStatus(t, client, jobID, api.AllocClientStatusRunning, srv.RootToken.SecretID)

	testCases := []struct {
		name        string // test case description
		jobPrefix   bool   // when true, pass only a prefix of the job ID
		aclPolicy   string // HCL rules for the test token; empty means no token
		expectedErr string // expected substring in stderr; empty means success
	}{
		{
			name:        "no token",
			aclPolicy:   "",
			expectedErr: api.PermissionDeniedErrorContent,
		},
		{
			name: "alloc-lifecycle not enough",
			aclPolicy: `
namespace "default" {
	capabilities = ["alloc-lifecycle"]
}
`,
			expectedErr: api.PermissionDeniedErrorContent,
		},
		{
			name: "read-job not enough",
			aclPolicy: `
namespace "default" {
	capabilities = ["read-job"]
}
`,
			expectedErr: api.PermissionDeniedErrorContent,
		},
		{
			name: "alloc-lifecycle and read-job allowed",
			aclPolicy: `
namespace "default" {
	capabilities = ["alloc-lifecycle", "read-job"]
}
`,
		},
		{
			name: "job prefix requires list-jobs",
			aclPolicy: `
namespace "default" {
	capabilities = ["alloc-lifecycle", "read-job"]
}
`,
			jobPrefix:   true,
			expectedErr: "job not found",
		},
		{
			name: "job prefix works with list-jobs",
			aclPolicy: `
namespace "default" {
	capabilities = ["list-jobs", "alloc-lifecycle", "read-job"]
}
`,
			jobPrefix: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
			args := []string{
				"-address", url,
				"-yes",
			}

			if tc.aclPolicy != "" {
				// Create ACL token with test case policy.
				policy := &api.ACLPolicy{
					Name:  nonAlphaNum.ReplaceAllString(tc.name, "-"),
					Rules: tc.aclPolicy,
				}
				_, err := client.ACLPolicies().Upsert(policy, rootTokenOpts)
				must.NoError(t, err)

				token := &api.ACLToken{
					Type:     "client",
					Policies: []string{policy.Name},
				}
				token, _, err = client.ACLTokens().Create(token, rootTokenOpts)
				must.NoError(t, err)

				// Set token in command args.
				args = append(args, "-token", token.SecretID)
			}

			// Add job ID or job ID prefix to the command.
			if tc.jobPrefix {
				args = append(args, jobID[0:3])
			} else {
				args = append(args, jobID)
			}

			// Run command.
			code := cmd.Run(args)
			if tc.expectedErr == "" {
				must.Zero(t, code)
			} else {
				must.One(t, code)
				must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr)
			}
		})
	}
}

// TODO(luiz): update once alloc restart supports -no-shutdown-delay.
func TestJobRestartCommand_shutdownDelay_reschedule(t *testing.T) {
	ci.Parallel(t)

	// Start client and server and wait for node to be ready.
	srv, client, url := testServer(t, true, nil)
	defer srv.Shutdown()

	waitForNodes(t, client)

	testCases := []struct {
		name          string
		args          []string // extra command line arguments
		shutdownDelay bool     // whether the task shutdown_delay is expected to apply
	}{
		{
			name:          "job reschedule with shutdown delay by default",
			args:          []string{"-reschedule"},
			shutdownDelay: true,
		},
		{
			name:          "job reschedule no shutdown delay",
			args:          []string{"-reschedule", "-no-shutdown-delay"},
			shutdownDelay: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}

			// Register job with 2 allocations and shutdown_delay.
			// The task has a 3s shutdown_delay and a Nomad service so the
			// delay is observable between the "Killing" and "Killed" events.
			shutdownDelay := 3 * time.Second
			jobID := nonAlphaNum.ReplaceAllString(tc.name, "-")

			job := testJob(jobID)
			job.TaskGroups[0].Count = pointer.Of(2)
			job.TaskGroups[0].Tasks[0].Config["run_for"] = "10m"
			job.TaskGroups[0].Tasks[0].ShutdownDelay = shutdownDelay
			job.TaskGroups[0].Tasks[0].Services = []*api.Service{{
				Name:     "service",
				Provider: "nomad",
			}}

			resp, _, err := client.Jobs().Register(job, nil)
			must.NoError(t, err)

			code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
			must.Zero(t, code)
			ui.OutputWriter.Reset()

			// Wait for alloc to be running.
			allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)
			must.NoError(t, err)
			for _, alloc := range allocStubs {
				waitForAllocRunning(t, client, alloc.ID)
			}

			// Add address and job ID to the command and run.
			args := []string{
				"-address", url,
				"-batch-size", "1",
				"-batch-wait", "0",
				"-yes",
			}
			args = append(args, tc.args...)
			args = append(args, jobID)

			code = cmd.Run(args)
			must.Zero(t, code)

			// Wait for all allocs to restart (the -reschedule flag causes a
			// reschedule rather than an in-place restart).
			reschedules := map[string]bool{}
			for _, alloc := range allocStubs {
				reschedules[alloc.ID] = true
			}
			allocs := waitAllocsRescheduled(t, client, reschedules)

			// Check that allocs have shutdown delay event.
			for _, alloc := range allocs {
				for _, s := range alloc.TaskStates {
					var killedEv *api.TaskEvent
					var killingEv *api.TaskEvent
					for _, ev := range s.Events {
						if strings.Contains(ev.Type, "Killed") {
							killedEv = ev
						}
						if strings.Contains(ev.Type, "Killing") {
							killingEv = ev
						}
					}

					// diff is the elapsed time (in nanoseconds) between the
					// "Killing" and "Killed" task events.
					diff := killedEv.Time - killingEv.Time
					if tc.shutdownDelay {
						must.GreaterEq(t, shutdownDelay, time.Duration(diff))
					} else {
						// Add a bit of slack to account for the actual
						// shutdown time of the task.
						// NOTE(review): this branch runs for the
						// -no-shutdown-delay case but still requires
						// diff >= shutdownDelay; confirm the two assertion
						// branches are not swapped.
						must.Between(t, shutdownDelay, time.Duration(diff), shutdownDelay+time.Second)
					}
				}
			}
		})
	}
}

// TestJobRestartCommand_filterAllocs verifies that filterAllocs keeps only
// the allocations matching the -group and -task flags, resolving group and
// task names against the job version each allocation was created from.
func TestJobRestartCommand_filterAllocs(t *testing.T) {
	ci.Parallel(t)

	task1 := api.NewTask("task_1", "mock_driver")
	task2 := api.NewTask("task_2", "mock_driver")
	task3 := api.NewTask("task_3", "mock_driver")

	// Job version 1 has three groups; group_2 has tasks 1 and 2, and
	// group_3 has task 3.
	jobV1 := api.NewServiceJob("example", "example", "global", 1).
		AddTaskGroup(
			api.NewTaskGroup("group_1", 1).
				AddTask(task1),
		).
		AddTaskGroup(
			api.NewTaskGroup("group_2", 1).
				AddTask(task1).
				AddTask(task2),
		).
		AddTaskGroup(
			api.NewTaskGroup("group_3", 1).
				AddTask(task3),
		)
	jobV1.Version = pointer.Of(uint64(1))

	// Job version 2 drops group_3 entirely and task_1 from group_2.
	jobV2 := api.NewServiceJob("example", "example", "global", 1).
		AddTaskGroup(
			api.NewTaskGroup("group_1", 1).
				AddTask(task1),
		).
		AddTaskGroup(
			api.NewTaskGroup("group_2", 1).
				AddTask(task2),
		)
	jobV2.Version = pointer.Of(uint64(2))

	// Generate one alloc stub per job version, group, desired status, and
	// client status combination. Map keys follow the pattern
	// job_v<version>_<group>_<desired>_<client>.
	allAllocs := []AllocationListStubWithJob{}
	allocs := map[string]AllocationListStubWithJob{}
	for _, job := range []*api.Job{jobV1, jobV2} {
		for _, tg := range job.TaskGroups {
			for _, desired := range []string{api.AllocDesiredStatusRun, api.AllocDesiredStatusStop} {
				for _, client := range []string{api.AllocClientStatusRunning, api.AllocClientStatusComplete} {
					key := fmt.Sprintf("job_v%d_%s_%s_%s", *job.Version, *tg.Name, desired, client)
					alloc := AllocationListStubWithJob{
						AllocationListStub: &api.AllocationListStub{
							ID:            key,
							JobVersion:    *job.Version,
							TaskGroup:     *tg.Name,
							DesiredStatus: desired,
							ClientStatus:  client,
						},
						Job: job,
					}
					allocs[key] = alloc
					allAllocs = append(allAllocs, alloc)
				}
			}
		}
	}

	testCases := []struct {
		name           string
		args           []string // command line flags to parse
		expectedAllocs []AllocationListStubWithJob
	}{
		{
			name: "skip by group",
			args: []string{"-group", "group_1"},
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_1_run_running"],
				allocs["job_v1_group_1_run_complete"],
				allocs["job_v1_group_1_stop_running"],
				allocs["job_v2_group_1_run_running"],
				allocs["job_v2_group_1_run_complete"],
				allocs["job_v2_group_1_stop_running"],
			},
		},
		{
			name: "skip by old group",
			args: []string{"-group", "group_3"},
			// group_3 only exists in job version 1.
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_3_run_running"],
				allocs["job_v1_group_3_run_complete"],
				allocs["job_v1_group_3_stop_running"],
			},
		},
		{
			name: "skip by task",
			args: []string{"-task", "task_2"},
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_2_run_running"],
				allocs["job_v1_group_2_run_complete"],
				allocs["job_v1_group_2_stop_running"],
				allocs["job_v2_group_2_run_running"],
				allocs["job_v2_group_2_run_complete"],
				allocs["job_v2_group_2_stop_running"],
			},
		},
		{
			name: "skip by old task",
			args: []string{"-task", "task_3"},
			// task_3 only exists in job version 1.
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_3_run_running"],
				allocs["job_v1_group_3_run_complete"],
				allocs["job_v1_group_3_stop_running"],
			},
		},
		{
			name: "skip by group and task",
			args: []string{
				"-group", "group_1",
				"-group", "group_2",
				"-task", "task_2",
			},
			// Only group_2 has task_2 in all job versions.
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_2_run_running"],
				allocs["job_v1_group_2_run_complete"],
				allocs["job_v1_group_2_stop_running"],
				allocs["job_v2_group_2_run_running"],
				allocs["job_v2_group_2_run_complete"],
				allocs["job_v2_group_2_stop_running"],
			},
		},
		{
			name: "skip by status",
			args: []string{},
			// Without filters only the allocations that are both
			// desired-stop and client-complete are skipped.
			expectedAllocs: []AllocationListStubWithJob{
				allocs["job_v1_group_1_run_running"],
				allocs["job_v1_group_1_run_complete"],
				allocs["job_v1_group_1_stop_running"],
				allocs["job_v1_group_2_run_running"],
				allocs["job_v1_group_2_run_complete"],
				allocs["job_v1_group_2_stop_running"],
				allocs["job_v1_group_3_run_running"],
				allocs["job_v1_group_3_run_complete"],
				allocs["job_v1_group_3_stop_running"],
				allocs["job_v2_group_1_run_running"],
				allocs["job_v2_group_1_run_complete"],
				allocs["job_v2_group_1_stop_running"],
				allocs["job_v2_group_2_run_running"],
				allocs["job_v2_group_2_run_complete"],
				allocs["job_v2_group_2_stop_running"],
			},
		},
		{
			name:           "no matches by group",
			args:           []string{"-group", "group_404"},
			expectedAllocs: []AllocationListStubWithJob{},
		},
		{
			name:           "no matches by task",
			args:           []string{"-task", "task_404"},
			expectedAllocs: []AllocationListStubWithJob{},
		},
		{
			name: "no matches by task with group",
			args: []string{
				"-group", "group_1",
				"-task", "task_2", // group_1 never has task_2.
			},
			expectedAllocs: []AllocationListStubWithJob{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &JobRestartCommand{
				Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}},
			}

			// Parse the command flags; -yes skips the interactive prompt.
			args := append(tc.args, "-verbose", "-yes", "example")
			code, err := cmd.parseAndValidate(args)
			must.NoError(t, err)
			must.Zero(t, code)

			// Only the expected allocations must survive the filter.
			got := cmd.filterAllocs(allAllocs)
			must.SliceEqFunc(t, tc.expectedAllocs, got, func(a, b AllocationListStubWithJob) bool {
				return a.ID == b.ID
			})

			// Every filtered-out allocation must be reported as skipped in
			// the command output.
			expected := set.FromFunc(tc.expectedAllocs, func(a AllocationListStubWithJob) string {
				return a.ID
			})
			for _, a := range allAllocs {
				if !expected.Contains(a.ID) {
					must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("Skipping allocation %q", a.ID))
				}
			}
		})
	}
}

// TestJobRestartCommand_onErrorFail verifies that the command stops issuing
// restart requests after the first error when -on-error=fail is used.
func TestJobRestartCommand_onErrorFail(t *testing.T) {
	ci.Parallel(t)

	ui := cli.NewMockUi()
	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}

	// Start client and server and wait for node to be ready.
	srv, client, url := testServer(t, true, nil)
	defer srv.Shutdown()

	parsedURL, err := neturl.Parse(url)
	must.NoError(t, err)

	waitForNodes(t, client)

	// Register a job with 3 allocations.
	jobID := "test_job_restart_command_fail_on_error"
	job := testJob(jobID)
	job.TaskGroups[0].Count = pointer.Of(3)

	resp, _, err := client.Jobs().Register(job, nil)
	must.NoError(t, err)

	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
	must.Zero(t, code)
	ui.OutputWriter.Reset()

	// Create a proxy to inject an error after 2 allocation restarts.
	// Also counts how many restart requests are made so we can check that the
	// command stops after the error happens.
	// allocRestarts counts how many alloc restart requests reach the API.
	var allocRestarts int32
	proxy := httptest.NewServer(&httputil.ReverseProxy{
		ModifyResponse: func(resp *http.Response) error {
			if strings.HasSuffix(resp.Request.URL.Path, "/restart") {
				count := atomic.AddInt32(&allocRestarts, 1)
				if count == 2 {
					// Fail the second restart request.
					return fmt.Errorf("fail")
				}
			}
			return nil
		},
		Rewrite: func(r *httputil.ProxyRequest) {
			// Forward everything else to the real Nomad agent.
			r.SetURL(parsedURL)
		},
	})
	defer proxy.Close()

	// Run command with -on-error=fail.
	// Expect only 2 restarts requests even though there are 3 allocations.
	code = cmd.Run([]string{
		"-address", proxy.URL,
		"-on-error", jobRestartOnErrorFail,
		jobID,
	})
	must.One(t, code)
	must.Eq(t, 2, allocRestarts)
}

// waitTasksRestarted blocks until the given allocations have restarted or not.
// Returns a list with updated state of the allocations.
//
// To determine if a restart happened the function looks for a "Restart
// Signaled" event in the list of task events. Allocations that are reused
// between tests may contain a restart event from a past test case, leading to
// false positives.
//
// The restarts map contains values structured as group:task:<expect restart?>.
func waitTasksRestarted(
	t *testing.T,
	client *api.Client,
	allocs []*api.AllocationListStub,
	restarts map[string]map[string]bool,
) []*api.Allocation {
	t.Helper()

	var newAllocs []*api.Allocation
	testutil.WaitForResult(func() (bool, error) {
		newAllocs = make([]*api.Allocation, 0, len(allocs))

		for _, alloc := range allocs {
			// Every alloc group must be accounted for in the expectations.
			if _, ok := restarts[alloc.TaskGroup]; !ok {
				t.Fatalf("Missing group %q in restarts map", alloc.TaskGroup)
			}

			// Skip allocations that are not supposed to be running.
			if alloc.DesiredStatus != api.AllocDesiredStatusRun {
				continue
			}

			updated, _, err := client.Allocations().Info(alloc.ID, nil)
			if err != nil {
				return false, err
			}
			newAllocs = append(newAllocs, updated)

			for task, state := range updated.TaskStates {
				// A "Restart Signaled" event indicates the task restarted.
				restarted := false
				for _, ev := range state.Events {
					if ev.Type == api.TaskRestartSignal {
						restarted = true
						break
					}
				}

				if restarted && !restarts[updated.TaskGroup][task] {
					return false, fmt.Errorf(
						"task %q in alloc %s for group %q not expected to restart",
						task, updated.ID, updated.TaskGroup,
					)
				}
				if !restarted && restarts[updated.TaskGroup][task] {
					return false, fmt.Errorf(
						"task %q in alloc %s for group %q expected to restart but didn't",
						task, updated.ID, updated.TaskGroup,
					)
				}
			}
		}
		return true, nil
	}, func(err error) {
		must.NoError(t, err)
	})

	return newAllocs
}

// waitAllocsRescheduled blocks until the given allocations have been
// rescheduled or not. Returns a list with updated state of the allocations.
//
// To determine if an allocation has been rescheduled the function looks for
// a non-empty NextAllocation field.
//
// The reschedules map maps allocation IDs to a boolean indicating if a
// reschedule is expected for that allocation.
1521 func waitAllocsRescheduled(t *testing.T, client *api.Client, reschedules map[string]bool) []*api.Allocation { 1522 t.Helper() 1523 1524 var newAllocs []*api.Allocation 1525 testutil.WaitForResult(func() (bool, error) { 1526 newAllocs = make([]*api.Allocation, 0, len(reschedules)) 1527 1528 for allocID, reschedule := range reschedules { 1529 alloc, _, err := client.Allocations().Info(allocID, nil) 1530 if err != nil { 1531 return false, err 1532 } 1533 newAllocs = append(newAllocs, alloc) 1534 1535 wasRescheduled := alloc.NextAllocation != "" 1536 if wasRescheduled && !reschedule { 1537 return false, fmt.Errorf("alloc %s not expected to be rescheduled", alloc.ID) 1538 } 1539 if !wasRescheduled && reschedule { 1540 return false, fmt.Errorf("alloc %s expected to be rescheduled but wasn't", alloc.ID) 1541 } 1542 } 1543 return true, nil 1544 }, func(err error) { 1545 must.NoError(t, err) 1546 }) 1547 1548 return newAllocs 1549 } 1550 1551 // getRestartBatches returns a list of allocations per batch of restarts. 1552 // 1553 // Since restarts are issued concurrently, it's expected that allocations in 1554 // the same batch have fairly close LastRestart times, so a 1s delay between 1555 // restarts may be enough to indicate a new batch. 
1556 func getRestartBatches(allocs []*api.Allocation, groups []string, task string) [][]*api.Allocation { 1557 groupsSet := set.From(groups) 1558 batches := [][]*api.Allocation{} 1559 1560 type allocRestart struct { 1561 alloc *api.Allocation 1562 restart time.Time 1563 } 1564 1565 restarts := make([]allocRestart, 0, len(allocs)) 1566 for _, alloc := range allocs { 1567 if !groupsSet.Contains(alloc.TaskGroup) { 1568 continue 1569 } 1570 1571 restarts = append(restarts, allocRestart{ 1572 alloc: alloc, 1573 restart: alloc.TaskStates[task].LastRestart, 1574 }) 1575 } 1576 1577 sort.Slice(restarts, func(i, j int) bool { 1578 return restarts[i].restart.Before(restarts[j].restart) 1579 }) 1580 1581 prev := restarts[0].restart 1582 batch := []*api.Allocation{} 1583 for _, r := range restarts { 1584 if r.restart.Sub(prev) >= time.Second { 1585 prev = r.restart 1586 batches = append(batches, batch) 1587 batch = []*api.Allocation{} 1588 } 1589 batch = append(batch, r.alloc) 1590 } 1591 batches = append(batches, batch) 1592 1593 return batches 1594 }