github.com/hernad/nomad@v1.6.112/command/job_restart_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package command
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"net/http"
    10  	"net/http/httptest"
    11  	"net/http/httputil"
    12  	neturl "net/url"
    13  	"regexp"
    14  	"sort"
    15  	"strings"
    16  	"sync/atomic"
    17  	"testing"
    18  	"time"
    19  
    20  	"github.com/google/go-cmp/cmp/cmpopts"
    21  	"github.com/hashicorp/go-set"
    22  	"github.com/hernad/nomad/api"
    23  	"github.com/hernad/nomad/ci"
    24  	"github.com/hernad/nomad/command/agent"
    25  	"github.com/hernad/nomad/helper/pointer"
    26  	"github.com/hernad/nomad/testutil"
    27  	"github.com/mitchellh/cli"
    28  
    29  	"github.com/shoenig/test/must"
    30  	"github.com/shoenig/test/wait"
    31  )
    32  
    33  func TestJobRestartCommand_Implements(t *testing.T) {
    34  	ci.Parallel(t)
    35  	var _ cli.Command = &JobRestartCommand{}
    36  }
    37  
    38  func TestJobRestartCommand_parseAndValidate(t *testing.T) {
    39  	ci.Parallel(t)
    40  
    41  	testCases := []struct {
    42  		name        string
    43  		args        []string
    44  		expectedErr string
    45  		expectedCmd *JobRestartCommand
    46  	}{
    47  		{
    48  			name:        "missing job",
    49  			args:        []string{},
    50  			expectedErr: "This command takes one argument",
    51  		},
    52  		{
    53  			name:        "too many args",
    54  			args:        []string{"one", "two", "three"},
    55  			expectedErr: "This command takes one argument",
    56  		},
    57  		{
    58  			name: "tasks and groups",
    59  			args: []string{
    60  				"-task", "my-task-1", "-task", "my-task-2",
    61  				"-group", "my-group-1", "-group", "my-group-2",
    62  				"my-job",
    63  			},
    64  			expectedCmd: &JobRestartCommand{
    65  				jobID:     "my-job",
    66  				groups:    set.From([]string{"my-group-1", "my-group-2"}),
    67  				tasks:     set.From([]string{"my-task-1", "my-task-2"}),
    68  				batchSize: 1,
    69  			},
    70  		},
    71  		{
    72  			name: "all tasks",
    73  			args: []string{"-all-tasks", "my-job"},
    74  			expectedCmd: &JobRestartCommand{
    75  				jobID:     "my-job",
    76  				allTasks:  true,
    77  				batchSize: 1,
    78  			},
    79  		},
    80  		{
    81  			name:        "all tasks conflicts with task",
    82  			args:        []string{"-all-tasks", "-task", "my-task", "-yes", "my-job"},
    83  			expectedErr: "The -all-tasks option cannot be used with -task",
    84  		},
    85  		{
    86  			name: "batch size as number",
    87  			args: []string{"-batch-size", "10", "my-job"},
    88  			expectedCmd: &JobRestartCommand{
    89  				jobID:     "my-job",
    90  				batchSize: 10,
    91  			},
    92  		},
    93  		{
    94  			name: "batch size as percentage",
    95  			args: []string{"-batch-size", "10%", "my-job"},
    96  			expectedCmd: &JobRestartCommand{
    97  				jobID:            "my-job",
    98  				batchSize:        10,
    99  				batchSizePercent: true,
   100  			},
   101  		},
   102  		{
   103  			name:        "batch size not valid",
   104  			args:        []string{"-batch-size", "not-valid", "my-job"},
   105  			expectedErr: "Invalid -batch-size value",
   106  		},
   107  		{
   108  			name:        "batch size decimal not valid",
   109  			args:        []string{"-batch-size", "1.5", "my-job"},
   110  			expectedErr: "Invalid -batch-size value",
   111  		},
   112  		{
   113  			name:        "batch size zero",
   114  			args:        []string{"-batch-size", "0", "my-job"},
   115  			expectedErr: "Invalid -batch-size value",
   116  		},
   117  		{
   118  			name:        "batch size decimal percent not valid",
   119  			args:        []string{"-batch-size", "1.5%", "my-job"},
   120  			expectedErr: "Invalid -batch-size value",
   121  		},
   122  		{
   123  			name:        "batch size zero percentage",
   124  			args:        []string{"-batch-size", "0%", "my-job"},
   125  			expectedErr: "Invalid -batch-size value",
   126  		},
   127  		{
   128  			name:        "batch size with multiple numbers and percentages",
   129  			args:        []string{"-batch-size", "15%10%", "my-job"},
   130  			expectedErr: "Invalid -batch-size value",
   131  		},
   132  		{
   133  			name:        "batch wait ask",
   134  			args:        []string{"-batch-wait", "ask", "my-job"},
   135  			expectedErr: "terminal is not interactive", // Can't test "ask" because the test terminal is non-interactive.
   136  		},
   137  		{
   138  			name: "batch wait duration",
   139  			args: []string{"-batch-wait", "10s", "my-job"},
   140  			expectedCmd: &JobRestartCommand{
   141  				jobID:     "my-job",
   142  				batchSize: 1,
   143  				batchWait: 10 * time.Second,
   144  			},
   145  		},
   146  		{
   147  			name:        "batch wait invalid",
   148  			args:        []string{"-batch-wait", "10", "my-job"},
   149  			expectedErr: "Invalid -batch-wait value",
   150  		},
   151  		{
   152  			name: "on error fail",
   153  			args: []string{"-on-error", "fail", "my-job"},
   154  			expectedCmd: &JobRestartCommand{
   155  				jobID:     "my-job",
   156  				batchSize: 1,
   157  				onError:   jobRestartOnErrorFail,
   158  			},
   159  		},
   160  		{
   161  			name:        "on error invalid",
   162  			args:        []string{"-on-error", "invalid", "my-job"},
   163  			expectedErr: "Invalid -on-error value",
   164  		},
   165  		{
   166  			name: "no shutdown delay",
   167  			args: []string{"-no-shutdown-delay", "my-job"},
   168  			expectedCmd: &JobRestartCommand{
   169  				jobID:           "my-job",
   170  				batchSize:       1,
   171  				noShutdownDelay: true,
   172  			},
   173  		},
   174  		{
   175  			name: "reschedule",
   176  			args: []string{"-reschedule", "my-job"},
   177  			expectedCmd: &JobRestartCommand{
   178  				jobID:      "my-job",
   179  				batchSize:  1,
   180  				reschedule: true,
   181  			},
   182  		},
   183  		{
   184  			name:        "reschedule conflicts with task",
   185  			args:        []string{"-reschedule", "-task", "my-task", "-yes", "my-job"},
   186  			expectedErr: "The -reschedule option cannot be used with -task",
   187  		},
   188  		{
   189  			name: "verbose",
   190  			args: []string{"-verbose", "my-job"},
   191  			expectedCmd: &JobRestartCommand{
   192  				jobID:     "my-job",
   193  				batchSize: 1,
   194  				verbose:   true,
   195  				length:    fullId,
   196  			},
   197  		},
   198  	}
   199  
   200  	for _, tc := range testCases {
   201  		t.Run(tc.name, func(t *testing.T) {
   202  			ui := &cli.ConcurrentUi{Ui: cli.NewMockUi()}
   203  			meta := Meta{Ui: ui}
   204  
   205  			// Set some default values if not defined in test case.
   206  			if tc.expectedCmd != nil {
   207  				tc.expectedCmd.Meta = meta
   208  
   209  				if tc.expectedCmd.length == 0 {
   210  					tc.expectedCmd.length = shortId
   211  				}
   212  				if tc.expectedCmd.groups == nil {
   213  					tc.expectedCmd.groups = set.New[string](0)
   214  				}
   215  				if tc.expectedCmd.tasks == nil {
   216  					tc.expectedCmd.tasks = set.New[string](0)
   217  				}
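        				// Cases that don't set -on-error expect the default "ask" behavior;
        				// prepend -yes so the command doesn't require an interactive
        				// terminal to confirm.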
   218  				if tc.expectedCmd.onError == "" {
   219  					tc.expectedCmd.onError = jobRestartOnErrorAsk
   220  					tc.expectedCmd.autoYes = true
   221  					tc.args = append([]string{"-yes"}, tc.args...)
   222  				}
   223  			}
   224  
   225  			cmd := &JobRestartCommand{Meta: meta}
   226  			code, err := cmd.parseAndValidate(tc.args)
   227  
   228  			if tc.expectedErr != "" {
   229  				must.NonZero(t, code)
   230  				must.ErrorContains(t, err, tc.expectedErr)
   231  			} else {
   232  				must.NoError(t, err)
   233  				must.Zero(t, code)
   234  				must.Eq(t, tc.expectedCmd, cmd, must.Cmp(cmpopts.IgnoreFields(JobRestartCommand{}, "Meta", "Meta.Ui")))
   235  			}
   236  		})
   237  	}
   238  }
   239  
   240  func TestJobRestartCommand_Run(t *testing.T) {
   241  	ci.Parallel(t)
   242  
   243  	// Create a job with multiple tasks, groups, and allocations.
   244  	prestartTask := api.NewTask("prestart", "mock_driver").
   245  		SetConfig("run_for", "100ms").
   246  		SetConfig("exit_code", 0).
   247  		SetLifecycle(&api.TaskLifecycle{
   248  			Hook:    api.TaskLifecycleHookPrestart,
   249  			Sidecar: false,
   250  		})
   251  	sidecarTask := api.NewTask("sidecar", "mock_driver").
   252  		SetConfig("run_for", "1m").
   253  		SetConfig("exit_code", 0).
   254  		SetLifecycle(&api.TaskLifecycle{
   255  			Hook:    api.TaskLifecycleHookPoststart,
   256  			Sidecar: true,
   257  		})
   258  	mainTask := api.NewTask("main", "mock_driver").
   259  		SetConfig("run_for", "1m").
   260  		SetConfig("exit_code", 0)
   261  
   262  	jobID := "test_job_restart_cmd"
   263  	job := api.NewServiceJob(jobID, jobID, "global", 1).
   264  		AddDatacenter("dc1").
   265  		AddTaskGroup(
   266  			api.NewTaskGroup("single_task", 3).
   267  				AddTask(mainTask),
   268  		).
   269  		AddTaskGroup(
   270  			api.NewTaskGroup("multiple_tasks", 2).
   271  				AddTask(prestartTask).
   272  				AddTask(sidecarTask).
   273  				AddTask(mainTask),
   274  		)
   275  
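        	// The job above runs 3 "single_task" allocations and 2 "multiple_tasks"
        	// allocations, 5 in total; the batch assertions in the test cases below
        	// rely on these counts.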
   276  	testCases := []struct {
   277  		name         string
   278  		args         []string // Job arg is added automatically.
   279  		expectedCode int
   280  		validateFn   func(*testing.T, *api.Client, []*api.AllocationListStub, string, string)
   281  	}{
   282  		{
   283  			name: "restart only running tasks in all groups by default",
   284  			args: []string{"-batch-size", "100%"},
   285  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   286  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   287  					"single_task": {
   288  						"main": true,
   289  					},
   290  					"multiple_tasks": {
   291  						"prestart": false,
   292  						"sidecar":  true,
   293  						"main":     true,
   294  					},
   295  				})
   296  
   297  				// Check that allocations restarted in a single batch.
   298  				batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main")
   299  				must.Len(t, 5, batches[0])
   300  				must.StrContains(t, stdout, "Restarting 1st batch")
   301  				must.StrNotContains(t, stdout, "restarting the next batch")
   302  
   303  			},
   304  		},
   305  		{
   306  			name: "restart specific task in all groups",
   307  			args: []string{"-batch-size", "100%", "-task", "main"},
   308  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   309  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   310  					"single_task": {
   311  						"main": true,
   312  					},
   313  					"multiple_tasks": {
   314  						"prestart": false,
   315  						"sidecar":  false,
   316  						"main":     true,
   317  					},
   318  				})
   319  
   320  				// Check that allocations restarted in a single batch.
   321  				batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main")
   322  				must.Len(t, 5, batches[0])
   323  				must.StrContains(t, stdout, "Restarting 1st batch")
   324  				must.StrNotContains(t, stdout, "restarting the next batch")
   325  			},
   326  		},
   327  		{
   328  			name: "restart multiple tasks in all groups",
   329  			args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar"},
   330  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   331  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   332  					"single_task": {
   333  						"main": true,
   334  					},
   335  					"multiple_tasks": {
   336  						"prestart": false,
   337  						"sidecar":  true,
   338  						"main":     true,
   339  					},
   340  				})
   341  
   342  				// Check that allocations restarted in a single batch.
   343  				batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main")
   344  				must.Len(t, 5, batches[0])
   345  				must.StrContains(t, stdout, "Restarting 1st batch")
   346  				must.StrNotContains(t, stdout, "restarting the next batch")
   347  			},
   348  		},
   349  		{
   350  			name: "restart all tasks in all groups",
   351  			args: []string{"-batch-size", "100%", "-all-tasks"},
   352  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   353  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   354  					"single_task": {
   355  						"main": true,
   356  					},
   357  					"multiple_tasks": {
   358  						"prestart": true,
   359  						"sidecar":  true,
   360  						"main":     true,
   361  					},
   362  				})
   363  
   364  				// Check that allocations restarted in a single batch.
   365  				batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main")
   366  				must.Len(t, 5, batches[0])
   367  				must.StrContains(t, stdout, "Restarting 1st batch")
   368  				must.StrNotContains(t, stdout, "restarting the next batch")
   369  			},
   370  		},
   371  		{
   372  			name: "restart running tasks in specific group",
   373  			args: []string{"-batch-size", "100%", "-group", "single_task"},
   374  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   375  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   376  					"single_task": {
   377  						"main": true,
   378  					},
   379  					"multiple_tasks": {
   380  						"prestart": false,
   381  						"sidecar":  false,
   382  						"main":     false,
   383  					},
   384  				})
   385  
   386  				// Check that allocations restarted in a single batch.
   387  				batches := getRestartBatches(restarted, []string{"single_task"}, "main")
   388  				must.Len(t, 3, batches[0])
   389  				must.StrContains(t, stdout, "Restarting 1st batch")
   390  				must.StrNotContains(t, stdout, "restarting the next batch")
   391  
   392  			},
   393  		},
   394  		{
   395  			name: "restart specific task that is not running",
   396  			args: []string{"-batch-size", "100%", "-task", "prestart"},
   397  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   398  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   399  					"single_task": {
   400  						"main": false,
   401  					},
   402  					"multiple_tasks": {
   403  						"prestart": false,
   404  						"sidecar":  false,
   405  						"main":     false,
   406  					},
   407  				})
   408  
   409  				// Check that allocations restarted in a single batch.
   410  				batches := getRestartBatches(restarted, []string{"single_task"}, "main")
   411  				must.Len(t, 3, batches[0])
   412  				must.StrContains(t, stdout, "Restarting 1st batch")
   413  				must.StrNotContains(t, stdout, "restarting the next batch")
   414  
   415  				// Check that we have an error message.
   416  				must.StrContains(t, stderr, "Task not running")
   417  			},
   418  			expectedCode: 1,
   419  		},
   420  		{
   421  			name: "restart specific task in specific group",
   422  			args: []string{"-batch-size", "100%", "-task", "main", "-group", "single_task"},
   423  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   424  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   425  					"single_task": {
   426  						"main": true,
   427  					},
   428  					"multiple_tasks": {
   429  						"prestart": false,
   430  						"sidecar":  false,
   431  						"main":     false,
   432  					},
   433  				})
   434  
   435  				// Check that allocations restarted in a single batch.
   436  				batches := getRestartBatches(restarted, []string{"single_task"}, "main")
   437  				must.Len(t, 3, batches[0])
   438  				must.StrContains(t, stdout, "Restarting 1st batch")
   439  				must.StrNotContains(t, stdout, "restarting the next batch")
   440  			},
   441  		},
   442  		{
   443  			name: "restart multiple tasks in specific group",
   444  			args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar", "-group", "multiple_tasks"},
   445  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   446  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   447  					"single_task": {
   448  						"main": false,
   449  					},
   450  					"multiple_tasks": {
   451  						"prestart": false,
   452  						"sidecar":  true,
   453  						"main":     true,
   454  					},
   455  				})
   456  
   457  				// Check that allocations restarted in a single batch.
   458  				batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main")
   459  				must.Len(t, 2, batches[0])
   460  				must.StrContains(t, stdout, "Restarting 1st batch")
   461  				must.StrNotContains(t, stdout, "restarting the next batch")
   462  			},
   463  		},
   464  		{
   465  			name: "restart all tasks in specific group",
   466  			args: []string{"-batch-size", "100%", "-all-tasks", "-group", "multiple_tasks"},
   467  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   468  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   469  					"single_task": {
   470  						"main": false,
   471  					},
   472  					"multiple_tasks": {
   473  						"prestart": true,
   474  						"sidecar":  true,
   475  						"main":     true,
   476  					},
   477  				})
   478  
   479  				// Check that allocations restarted in a single batch.
   480  				batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main")
   481  				must.Len(t, 2, batches[0])
   482  				must.StrContains(t, stdout, "Restarting 1st batch")
   483  				must.StrNotContains(t, stdout, "restarting the next batch")
   484  			},
   485  		},
   486  		{
   487  			name: "restart in batches",
   488  			args: []string{"-batch-size", "3", "-batch-wait", "3s", "-task", "main"},
   489  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   490  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   491  					"single_task": {
   492  						"main": true,
   493  					},
   494  					"multiple_tasks": {
   495  						"prestart": false,
   496  						"sidecar":  false,
   497  						"main":     true,
   498  					},
   499  				})
   500  
   501  				// Check that allocations were properly batched.
   502  				batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main")
   503  
   504  				must.Len(t, 3, batches[0])
   505  				must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations")
   506  
   507  				must.Len(t, 2, batches[1])
   508  				must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations")
   509  
   510  				// Check that we only waited between batches.
   511  				waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch")
   512  				must.Eq(t, 1, waitMsgCount)
   513  
   514  				// Check that batches waited the expected time.
   515  				batch1Restart := batches[0][0].TaskStates["main"].LastRestart
   516  				batch2Restart := batches[1][0].TaskStates["main"].LastRestart
   517  				diff := batch2Restart.Sub(batch1Restart)
   518  				must.Between(t, 3*time.Second, diff, 4*time.Second)
   519  			},
   520  		},
   521  		{
   522  			name: "restart in percent batch",
   523  			args: []string{"-batch-size", "50%", "-batch-wait", "3s", "-task", "main"},
   524  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   525  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   526  					"single_task": {
   527  						"main": true,
   528  					},
   529  					"multiple_tasks": {
   530  						"prestart": false,
   531  						"sidecar":  false,
   532  						"main":     true,
   533  					},
   534  				})
   535  
   536  				// Check that allocations were properly batched.
   537  				batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main")
   538  
   539  				must.Len(t, 3, batches[0])
   540  				must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations")
   541  
   542  				must.Len(t, 2, batches[1])
   543  				must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations")
   544  
   545  				// Check that we only waited between batches.
   546  				waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch")
   547  				must.Eq(t, 1, waitMsgCount)
   548  
   549  				// Check that batches waited the expected time.
   550  				batch1Restart := batches[0][0].TaskStates["main"].LastRestart
   551  				batch2Restart := batches[1][0].TaskStates["main"].LastRestart
   552  				diff := batch2Restart.Sub(batch1Restart)
   553  				must.Between(t, 3*time.Second, diff, 4*time.Second)
   554  			},
   555  		},
   556  		{
   557  			name: "restart in batch ask with yes",
   558  			args: []string{"-batch-size", "100%", "-batch-wait", "ask", "-yes", "-group", "single_task"},
   559  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   560  				restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{
   561  					"single_task": {
   562  						"main": true,
   563  					},
   564  					"multiple_tasks": {
   565  						"prestart": false,
   566  						"sidecar":  false,
   567  						"main":     false,
   568  					},
   569  				})
   570  
   571  				// Check that allocations restarted in a single batch.
   572  				batches := getRestartBatches(restarted, []string{"single_task"}, "main")
   573  				must.Len(t, 3, batches[0])
   574  				must.StrContains(t, stdout, "Restarting 1st batch")
   575  				must.StrNotContains(t, stdout, "restarting the next batch")
   576  			},
   577  		},
   578  		{
   579  			name: "reschedule in batches",
   580  			args: []string{"-reschedule", "-batch-size", "3"},
   581  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   582  				// Expect all allocations were rescheduled.
   583  				reschedules := map[string]bool{}
   584  				for _, alloc := range allocs {
   585  					reschedules[alloc.ID] = true
   586  				}
   587  				waitAllocsRescheduled(t, client, reschedules)
   588  
   589  				// Check that allocations were properly batched.
   590  				must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations")
   591  				must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations")
   592  				must.StrNotContains(t, stdout, "Waiting")
   593  			},
   594  		},
   595  		{
   596  			name: "reschedule specific group",
   597  			args: []string{"-reschedule", "-batch-size", "100%", "-group", "single_task"},
   598  			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
   599  				// Expect that only allocs for the single_task group were
   600  				// rescheduled.
   601  				reschedules := map[string]bool{}
   602  				for _, alloc := range allocs {
   603  					if alloc.TaskGroup == "single_task" {
   604  						reschedules[alloc.ID] = true
   605  					}
   606  				}
   607  				waitAllocsRescheduled(t, client, reschedules)
   608  
   609  				// Check that allocations restarted in a single batch.
   610  				must.StrContains(t, stdout, "Restarting 1st batch")
   611  				must.StrNotContains(t, stdout, "restarting the next batch")
   612  			},
   613  		},
   614  	}
   615  
   616  	for _, tc := range testCases {
   617  		tc := tc
   618  		t.Run(tc.name, func(t *testing.T) {
   619  			// Run each test case in parallel because they are fairly slow.
   620  			ci.Parallel(t)
   621  
   622  			// Initialize UI and command.
   623  			ui := cli.NewMockUi()
   624  			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
   625  
   626  			// Start client and server and wait for node to be ready.
   627  			// Use a separate cluster for each test case so they can run in
   628  			// parallel without affecting each other.
   629  			srv, client, url := testServer(t, true, nil)
   630  			defer srv.Shutdown()
   631  
   632  			waitForNodes(t, client)
   633  
   634  			// Register test job and wait for its allocs to be running.
   635  			resp, _, err := client.Jobs().Register(job, nil)
   636  			must.NoError(t, err)
   637  
   638  			code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
   639  			must.Zero(t, code)
   640  
   641  			allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)
   642  			must.NoError(t, err)
   643  			for _, alloc := range allocStubs {
   644  				waitForAllocRunning(t, client, alloc.ID)
   645  			}
   646  
   647  			// Fetch allocations before the restart so we know which ones are
   648  			// supposed to be affected in case the test reschedules allocs.
   649  			allocStubs, _, err = client.Jobs().Allocations(jobID, true, nil)
   650  			must.NoError(t, err)
   651  
   652  			// Prepend server URL and append job ID to the test case command.
   653  			args := []string{"-address", url, "-yes"}
   654  			args = append(args, tc.args...)
   655  			args = append(args, jobID)
   656  
   657  			// Run job restart command.
   658  			code = cmd.Run(args)
   659  			must.Eq(t, tc.expectedCode, code)
   660  
   661  			// Run test case validation function.
   662  			if tc.validateFn != nil {
   663  				tc.validateFn(t, client, allocStubs, ui.OutputWriter.String(), ui.ErrorWriter.String())
   664  			}
   665  		})
   666  	}
   667  }
   668  
   669  func TestJobRestartCommand_jobPrefixAndNamespace(t *testing.T) {
   670  	ci.Parallel(t)
   671  
   672  	ui := cli.NewMockUi()
   673  
   674  	// Start client and server and wait for node to be ready.
   675  	srv, client, url := testServer(t, true, nil)
   676  	defer srv.Shutdown()
   677  
   678  	waitForNodes(t, client)
   679  
   680  	// Create non-default namespace.
   681  	_, err := client.Namespaces().Register(&api.Namespace{Name: "prod"}, nil)
   682  	must.NoError(t, err)
   683  
   684  	// Register a job with the same name in both namespaces, plus a job with a unique name in prod.
   685  	evalIDs := []string{}
   686  
   687  	jobDefault := testJob("test_job_restart")
   688  	resp, _, err := client.Jobs().Register(jobDefault, nil)
   689  	must.NoError(t, err)
   690  	evalIDs = append(evalIDs, resp.EvalID)
   691  
   692  	jobProd := testJob("test_job_restart")
   693  	jobProd.Namespace = pointer.Of("prod")
   694  	resp, _, err = client.Jobs().Register(jobProd, nil)
   695  	must.NoError(t, err)
   696  	evalIDs = append(evalIDs, resp.EvalID)
   697  
   698  	jobUniqueProd := testJob("test_job_restart_prod_ns")
   699  	jobUniqueProd.Namespace = pointer.Of("prod")
   700  	resp, _, err = client.Jobs().Register(jobUniqueProd, nil)
   701  	must.NoError(t, err)
   702  	evalIDs = append(evalIDs, resp.EvalID)
   703  
   704  	// Wait for evals to be processed.
   705  	for _, evalID := range evalIDs {
   706  		code := waitForSuccess(ui, client, fullId, t, evalID)
   707  		must.Eq(t, 0, code)
   708  	}
   709  	ui.OutputWriter.Reset()
   710  
   711  	testCases := []struct {
   712  		name        string
   713  		args        []string
   714  		expectedErr string
   715  	}{
   716  		{
   717  			name: "prefix match in default namespace",
   718  			args: []string{"test_job"},
   719  		},
   720  		{
   721  			name:        "invalid job",
   722  			args:        []string{"not-valid"},
   723  			expectedErr: "No job(s) with prefix or ID",
   724  		},
   725  		{
   726  			name:        "prefix matches multiple jobs",
   727  			args:        []string{"-namespace", "prod", "test_job"},
   728  			expectedErr: "matched multiple jobs",
   729  		},
   730  		{
   731  			name:        "prefix matches multiple jobs across namespaces",
   732  			args:        []string{"-namespace", "*", "test_job"},
   733  			expectedErr: "matched multiple jobs",
   734  		},
   735  		{
   736  			name: "unique prefix match across namespaces",
   737  			args: []string{"-namespace", "*", "test_job_restart_prod"},
   738  		},
   739  	}
   740  
   741  	for _, tc := range testCases {
   742  		t.Run(tc.name, func(t *testing.T) {
   743  			defer func() {
   744  				ui.OutputWriter.Reset()
   745  				ui.ErrorWriter.Reset()
   746  			}()
   747  
   748  			cmd := &JobRestartCommand{
   749  				Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}},
   750  			}
   751  			args := append([]string{"-address", url, "-yes"}, tc.args...)
   752  			code := cmd.Run(args)
   753  
   754  			if tc.expectedErr != "" {
   755  				must.NonZero(t, code)
   756  				must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr)
   757  			} else {
   758  				must.Zero(t, code)
   759  			}
   760  		})
   761  	}
   762  }
   763  
   764  func TestJobRestartCommand_noAllocs(t *testing.T) {
   765  	ci.Parallel(t)
   766  
   767  	ui := cli.NewMockUi()
   768  	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
   769  
   770  	// Start client and server and wait for node to be ready.
   771  	srv, client, url := testServer(t, true, nil)
   772  	defer srv.Shutdown()
   773  
   774  	waitForNodes(t, client)
   775  
   776  	// Register test job with a nonexistent datacenter so it doesn't get allocs.
   777  	jobID := "test_job_restart_no_allocs"
   778  	job := testJob(jobID)
   779  	job.Datacenters = []string{"invalid"}
   780  
   781  	resp, _, err := client.Jobs().Register(job, nil)
   782  	must.NoError(t, err)
   783  
   784  	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
   785  	must.Eq(t, 2, code) // Placement is expected to fail so exit code is not 0.
   786  	ui.OutputWriter.Reset()
   787  
   788  	// Run job restart command and expect it to exit without restarts.
   789  	code = cmd.Run([]string{
   790  		"-address", url,
   791  		"-yes",
   792  		jobID,
   793  	})
   794  	must.Zero(t, code)
   795  	must.StrContains(t, ui.OutputWriter.String(), "No allocations to restart")
   796  }
   797  
   798  func TestJobRestartCommand_rescheduleFail(t *testing.T) {
   799  	ci.Parallel(t)
   800  
   801  	ui := cli.NewMockUi()
   802  	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
   803  
   804  	// Start client and server and wait for node to be ready.
   805  	srv, client, url := testServer(t, true, nil)
   806  	defer srv.Shutdown()
   807  
   808  	waitForNodes(t, client)
   809  
   810  	// Register test job with 3 allocs.
   811  	jobID := "test_job_restart_reschedule_fail"
   812  	job := testJob(jobID)
   813  	job.TaskGroups[0].Count = pointer.Of(3)
   814  
   815  	resp, _, err := client.Jobs().Register(job, nil)
   816  	must.NoError(t, err)
   817  
   818  	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
   819  	must.Zero(t, code)
   820  	ui.OutputWriter.Reset()
   821  
   822  	// Wait for allocs to be running.
   823  	allocs, _, err := client.Jobs().Allocations(jobID, true, nil)
   824  	must.NoError(t, err)
   825  	for _, alloc := range allocs {
   826  		waitForAllocRunning(t, client, alloc.ID)
   827  	}
   828  
   829  	// Mark node as ineligible to prevent allocs from being replaced.
   830  	nodeID := srv.Agent.Client().NodeID()
   831  	client.Nodes().ToggleEligibility(nodeID, false, nil)
   832  
   833  	// Run job restart command and expect it to fail.
   834  	code = cmd.Run([]string{
   835  		"-address", url,
   836  		"-batch-size", "2",
   837  		"-reschedule",
   838  		"-yes",
   839  		jobID,
   840  	})
   841  	must.One(t, code)
   842  	must.StrContains(t, ui.ErrorWriter.String(), "No nodes were eligible for evaluation")
   843  }
   844  
   845  func TestJobRestartCommand_monitorReplacementAlloc(t *testing.T) {
   846  	ci.Parallel(t)
   847  
   848  	ui := cli.NewMockUi()
   849  	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
   850  
   851  	srv, client, _ := testServer(t, true, nil)
   852  	defer srv.Shutdown()
   853  	waitForNodes(t, client)
   854  
   855  	// Register test job and update it twice so we end up with three
   856  	// allocations, each one replaced by the next.
   857  	jobID := "test_job_restart_monitor_replacement"
   858  	job := testJob(jobID)
   859  
   860  	for i := 1; i <= 3; i++ {
   861  		job.TaskGroups[0].Tasks[0].Config["run_for"] = fmt.Sprintf("%ds", i)
   862  		resp, _, err := client.Jobs().Register(job, nil)
   863  		must.NoError(t, err)
   864  
   865  		code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
   866  		must.Zero(t, code)
   867  	}
   868  	ui.OutputWriter.Reset()
   869  
   870  	// Prepare the command internals. We want to run a specific function and
   871  	// target a specific allocation, so we can't run the full command.
   872  	cmd.client = client
   873  	cmd.verbose = true
   874  	cmd.length = fullId
   875  
   876  	// Fetch, sort, and monitor the oldest allocation.
   877  	allocs, _, err := client.Jobs().Allocations(jobID, true, nil)
   878  	must.NoError(t, err)
   879  	sort.Slice(allocs, func(i, j int) bool {
   880  		return allocs[i].CreateIndex < allocs[j].CreateIndex
   881  	})
   882  
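        	// Monitor the oldest allocation in the background; monitorReplacementAlloc
        	// reports its final result on errCh.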
   883  	errCh := make(chan error)
   884  	go cmd.monitorReplacementAlloc(context.Background(), AllocationListStubWithJob{
   885  		AllocationListStub: allocs[0],
   886  		Job:                job,
   887  	}, errCh)
   888  
   889  	// Make sure the command doesn't get stuck and that we traverse the
   890  	// follow-up allocations properly.
   891  	must.Wait(t, wait.InitialSuccess(
   892  		wait.ErrorFunc(func() error {
   893  			select {
   894  			case err := <-errCh:
   895  				return err
   896  			default:
   897  				return fmt.Errorf("waiting for response")
   898  			}
   899  		}),
   900  		wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second),
   901  	))
   902  	must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[0].ID, allocs[1].ID))
   903  	must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[1].ID, allocs[2].ID))
   904  	must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q is %q", allocs[2].ID, api.AllocClientStatusRunning))
   905  }
   906  
   907  func TestJobRestartCommand_activeDeployment(t *testing.T) {
   908  	ci.Parallel(t)
   909  
   910  	srv, client, url := testServer(t, true, nil)
   911  	defer srv.Shutdown()
   912  	waitForNodes(t, client)
   913  
   914  	// Register test job and update it once to trigger a deployment.
   915  	jobID := "test_job_restart_deployment"
   916  	job := testJob(jobID)
   917  	job.Type = pointer.Of(api.JobTypeService)
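        	// Use a canary update without auto-promotion so the deployment stays in
        	// the "running" state until it is promoted.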
   918  	job.Update = &api.UpdateStrategy{
   919  		Canary:      pointer.Of(1),
   920  		AutoPromote: pointer.Of(false),
   921  	}
   922  
   923  	_, _, err := client.Jobs().Register(job, nil)
   924  	must.NoError(t, err)
   925  
   926  	_, _, err = client.Jobs().Register(job, nil)
   927  	must.NoError(t, err)
   928  
   929  	// Wait for a deployment to be running.
   930  	must.Wait(t, wait.InitialSuccess(
   931  		wait.ErrorFunc(func() error {
   932  			deployments, _, err := client.Jobs().Deployments(jobID, true, nil)
   933  			if err != nil {
   934  				return err
   935  			}
   936  			for _, d := range deployments {
   937  				if d.Status == api.DeploymentStatusRunning {
   938  					return nil
   939  				}
   940  			}
   941  			return fmt.Errorf("no running deployments")
   942  		}),
   943  		wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second),
   944  	))
   945  
   946  	// Run job restart command and expect it to fail.
   947  	ui := cli.NewMockUi()
   948  	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
   949  
   950  	code := cmd.Run([]string{
   951  		"-address", url,
   952  		"-on-error", jobRestartOnErrorFail,
   953  		"-verbose",
   954  		jobID,
   955  	})
   956  	must.One(t, code)
   957  	must.RegexMatch(t, regexp.MustCompile(`Deployment .+ is "running"`), ui.ErrorWriter.String())
   958  }
   959  
   960  func TestJobRestartCommand_ACL(t *testing.T) {
   961  	ci.Parallel(t)
   962  
   963  	// Start server with ACL enabled.
   964  	srv, client, url := testServer(t, true, func(c *agent.Config) {
   965  		c.ACL.Enabled = true
   966  	})
   967  	defer srv.Shutdown()
   968  
   969  	rootTokenOpts := &api.WriteOptions{
   970  		AuthToken: srv.RootToken.SecretID,
   971  	}
   972  
   973  	// Register test job.
   974  	jobID := "test_job_restart_acl"
   975  	job := testJob(jobID)
   976  	_, _, err := client.Jobs().Register(job, rootTokenOpts)
   977  	must.NoError(t, err)
   978  
   979  	// Wait for allocs to be running.
   980  	waitForJobAllocsStatus(t, client, jobID, api.AllocClientStatusRunning, srv.RootToken.SecretID)
   981  
   982  	testCases := []struct {
   983  		name        string
   984  		jobPrefix   bool
   985  		aclPolicy   string
   986  		expectedErr string
   987  	}{
   988  		{
   989  			name:        "no token",
   990  			aclPolicy:   "",
   991  			expectedErr: api.PermissionDeniedErrorContent,
   992  		},
   993  		{
   994  			name: "alloc-lifecycle not enough",
   995  			aclPolicy: `
   996  namespace "default" {
   997  	capabilities = ["alloc-lifecycle"]
   998  }
   999  `,
  1000  			expectedErr: api.PermissionDeniedErrorContent,
  1001  		},
  1002  		{
  1003  			name: "read-job not enough",
  1004  			aclPolicy: `
  1005  namespace "default" {
  1006  	capabilities = ["read-job"]
  1007  }
  1008  `,
  1009  			expectedErr: api.PermissionDeniedErrorContent,
  1010  		},
  1011  		{
  1012  			name: "alloc-lifecycle and read-job allowed",
  1013  			aclPolicy: `
  1014  namespace "default" {
  1015  	capabilities = ["alloc-lifecycle", "read-job"]
  1016  }
  1017  `,
  1018  		},
  1019  		{
  1020  			name: "job prefix requires list-jobs",
  1021  			aclPolicy: `
  1022  namespace "default" {
  1023  	capabilities = ["alloc-lifecycle", "read-job"]
  1024  }
  1025  `,
  1026  			jobPrefix:   true,
  1027  			expectedErr: "job not found",
  1028  		},
  1029  		{
  1030  			name: "job prefix works with list-jobs",
  1031  			aclPolicy: `
  1032  namespace "default" {
  1033  	capabilities = ["list-jobs", "alloc-lifecycle", "read-job"]
  1034  }
  1035  `,
  1036  			jobPrefix: true,
  1037  		},
  1038  	}
  1039  
  1040  	for _, tc := range testCases {
  1041  		t.Run(tc.name, func(t *testing.T) {
  1042  			ui := cli.NewMockUi()
  1043  			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
  1044  			args := []string{
  1045  				"-address", url,
  1046  				"-yes",
  1047  			}
  1048  
  1049  			if tc.aclPolicy != "" {
  1050  				// Create ACL token with test case policy.
  1051  				policy := &api.ACLPolicy{
  1052  					Name:  nonAlphaNum.ReplaceAllString(tc.name, "-"),
  1053  					Rules: tc.aclPolicy,
  1054  				}
  1055  				_, err := client.ACLPolicies().Upsert(policy, rootTokenOpts)
  1056  				must.NoError(t, err)
  1057  
  1058  				token := &api.ACLToken{
  1059  					Type:     "client",
  1060  					Policies: []string{policy.Name},
  1061  				}
  1062  				token, _, err = client.ACLTokens().Create(token, rootTokenOpts)
  1063  				must.NoError(t, err)
  1064  
  1065  				// Set token in command args.
  1066  				args = append(args, "-token", token.SecretID)
  1067  			}
  1068  
  1069  			// Add job ID or job ID prefix to the command.
  1070  			if tc.jobPrefix {
  1071  				args = append(args, jobID[0:3])
  1072  			} else {
  1073  				args = append(args, jobID)
  1074  			}
  1075  
  1076  			// Run command.
  1077  			code := cmd.Run(args)
  1078  			if tc.expectedErr == "" {
  1079  				must.Zero(t, code)
  1080  			} else {
  1081  				must.One(t, code)
  1082  				must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr)
  1083  			}
  1084  		})
  1085  	}
  1086  }
  1087  
  1088  // TODO(luiz): update once alloc restart supports -no-shutdown-delay.
  1089  func TestJobRestartCommand_shutdownDelay_reschedule(t *testing.T) {
  1090  	ci.Parallel(t)
  1091  
  1092  	// Start client and server and wait for node to be ready.
  1093  	srv, client, url := testServer(t, true, nil)
  1094  	defer srv.Shutdown()
  1095  
  1096  	waitForNodes(t, client)
  1097  
  1098  	testCases := []struct {
  1099  		name          string
  1100  		args          []string
  1101  		shutdownDelay bool
  1102  	}{
  1103  		{
  1104  			name:          "job reschedule with shutdown delay by default",
  1105  			args:          []string{"-reschedule"},
  1106  			shutdownDelay: true,
  1107  		},
  1108  		{
  1109  			name:          "job reschedule no shutdown delay",
  1110  			args:          []string{"-reschedule", "-no-shutdown-delay"},
  1111  			shutdownDelay: false,
  1112  		},
  1113  	}
  1114  
  1115  	for _, tc := range testCases {
  1116  		t.Run(tc.name, func(t *testing.T) {
  1117  			ui := cli.NewMockUi()
  1118  			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
  1119  
  1120  			// Register job with 2 allocations and shutdown_delay.
  1121  			shutdownDelay := 3 * time.Second
  1122  			jobID := nonAlphaNum.ReplaceAllString(tc.name, "-")
  1123  
  1124  			job := testJob(jobID)
  1125  			job.TaskGroups[0].Count = pointer.Of(2)
  1126  			job.TaskGroups[0].Tasks[0].Config["run_for"] = "10m"
  1127  			job.TaskGroups[0].Tasks[0].ShutdownDelay = shutdownDelay
  1128  			job.TaskGroups[0].Tasks[0].Services = []*api.Service{{
  1129  				Name:     "service",
  1130  				Provider: "nomad",
  1131  			}}
  1132  
  1133  			resp, _, err := client.Jobs().Register(job, nil)
  1134  			must.NoError(t, err)
  1135  
  1136  			code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
  1137  			must.Zero(t, code)
  1138  			ui.OutputWriter.Reset()
  1139  
  1140  			// Wait for allocs to be running.
  1141  			allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)
  1142  			must.NoError(t, err)
  1143  			for _, alloc := range allocStubs {
  1144  				waitForAllocRunning(t, client, alloc.ID)
  1145  			}
  1146  
  1147  			// Add address and job ID to the command and run.
  1148  			args := []string{
  1149  				"-address", url,
  1150  				"-batch-size", "1",
  1151  				"-batch-wait", "0",
  1152  				"-yes",
  1153  			}
  1154  			args = append(args, tc.args...)
  1155  			args = append(args, jobID)
  1156  
  1157  			code = cmd.Run(args)
  1158  			must.Zero(t, code)
  1159  
  1160  			// Wait for all allocs to be rescheduled.
  1161  			reschedules := map[string]bool{}
  1162  			for _, alloc := range allocStubs {
  1163  				reschedules[alloc.ID] = true
  1164  			}
  1165  			allocs := waitAllocsRescheduled(t, client, reschedules)
  1166  
  1167  			// Check that allocs have shutdown delay event.
  1168  			for _, alloc := range allocs {
  1169  				for _, s := range alloc.TaskStates {
  1170  					var killedEv *api.TaskEvent
  1171  					var killingEv *api.TaskEvent
  1172  					for _, ev := range s.Events {
  1173  						if strings.Contains(ev.Type, "Killed") {
  1174  							killedEv = ev
  1175  						}
  1176  						if strings.Contains(ev.Type, "Killing") {
  1177  							killingEv = ev
  1178  						}
  1179  					}
  1180  
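        				// The gap between the "Killing" and "Killed" events approximates
        				// how long the task waited to be killed, i.e. the effective
        				// shutdown delay.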
  1181  					diff := killedEv.Time - killingEv.Time
  1182  					if tc.shutdownDelay {
  1183  					// Add a bit of slack to account for the actual
  1184  					// shutdown time of the task.
  1185  					must.Between(t, shutdownDelay, time.Duration(diff), shutdownDelay+time.Second)
  1186  				} else {
  1187  					must.Less(t, shutdownDelay, time.Duration(diff))
  1188  					}
  1189  				}
  1190  			}
  1191  		})
  1192  	}
  1193  }
  1194  
  1195  func TestJobRestartCommand_filterAllocs(t *testing.T) {
  1196  	ci.Parallel(t)
  1197  
  1198  	task1 := api.NewTask("task_1", "mock_driver")
  1199  	task2 := api.NewTask("task_2", "mock_driver")
  1200  	task3 := api.NewTask("task_3", "mock_driver")
  1201  
  1202  	jobV1 := api.NewServiceJob("example", "example", "global", 1).
  1203  		AddTaskGroup(
  1204  			api.NewTaskGroup("group_1", 1).
  1205  				AddTask(task1),
  1206  		).
  1207  		AddTaskGroup(
  1208  			api.NewTaskGroup("group_2", 1).
  1209  				AddTask(task1).
  1210  				AddTask(task2),
  1211  		).
  1212  		AddTaskGroup(
  1213  			api.NewTaskGroup("group_3", 1).
  1214  				AddTask(task3),
  1215  		)
  1216  	jobV1.Version = pointer.Of(uint64(1))
  1217  
  1218  	jobV2 := api.NewServiceJob("example", "example", "global", 1).
  1219  		AddTaskGroup(
  1220  			api.NewTaskGroup("group_1", 1).
  1221  				AddTask(task1),
  1222  		).
  1223  		AddTaskGroup(
  1224  			api.NewTaskGroup("group_2", 1).
  1225  				AddTask(task2),
  1226  		)
  1227  	jobV2.Version = pointer.Of(uint64(2))
  1228  
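        	// Build an allocation stub for every combination of job version, task
        	// group, desired status, and client status, keyed as
        	// "job_v<version>_<group>_<desired>_<client>" to match the test cases below.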
  1229  	allAllocs := []AllocationListStubWithJob{}
  1230  	allocs := map[string]AllocationListStubWithJob{}
  1231  	for _, job := range []*api.Job{jobV1, jobV2} {
  1232  		for _, tg := range job.TaskGroups {
  1233  			for _, desired := range []string{api.AllocDesiredStatusRun, api.AllocDesiredStatusStop} {
  1234  				for _, client := range []string{api.AllocClientStatusRunning, api.AllocClientStatusComplete} {
  1235  					key := fmt.Sprintf("job_v%d_%s_%s_%s", *job.Version, *tg.Name, desired, client)
  1236  					alloc := AllocationListStubWithJob{
  1237  						AllocationListStub: &api.AllocationListStub{
  1238  							ID:            key,
  1239  							JobVersion:    *job.Version,
  1240  							TaskGroup:     *tg.Name,
  1241  							DesiredStatus: desired,
  1242  							ClientStatus:  client,
  1243  						},
  1244  						Job: job,
  1245  					}
  1246  					allocs[key] = alloc
  1247  					allAllocs = append(allAllocs, alloc)
  1248  				}
  1249  			}
  1250  		}
  1251  	}
  1252  
  1253  	testCases := []struct {
  1254  		name           string
  1255  		args           []string
  1256  		expectedAllocs []AllocationListStubWithJob
  1257  	}{
  1258  		{
  1259  			name: "skip by group",
  1260  			args: []string{"-group", "group_1"},
  1261  			expectedAllocs: []AllocationListStubWithJob{
  1262  				allocs["job_v1_group_1_run_running"],
  1263  				allocs["job_v1_group_1_run_complete"],
  1264  				allocs["job_v1_group_1_stop_running"],
  1265  				allocs["job_v2_group_1_run_running"],
  1266  				allocs["job_v2_group_1_run_complete"],
  1267  				allocs["job_v2_group_1_stop_running"],
  1268  			},
  1269  		},
  1270  		{
  1271  			name: "skip by old group",
  1272  			args: []string{"-group", "group_3"},
  1273  			expectedAllocs: []AllocationListStubWithJob{
  1274  				allocs["job_v1_group_3_run_running"],
  1275  				allocs["job_v1_group_3_run_complete"],
  1276  				allocs["job_v1_group_3_stop_running"],
  1277  			},
  1278  		},
  1279  		{
  1280  			name: "skip by task",
  1281  			args: []string{"-task", "task_2"},
  1282  			expectedAllocs: []AllocationListStubWithJob{
  1283  				allocs["job_v1_group_2_run_running"],
  1284  				allocs["job_v1_group_2_run_complete"],
  1285  				allocs["job_v1_group_2_stop_running"],
  1286  				allocs["job_v2_group_2_run_running"],
  1287  				allocs["job_v2_group_2_run_complete"],
  1288  				allocs["job_v2_group_2_stop_running"],
  1289  			},
  1290  		},
  1291  		{
  1292  			name: "skip by old task",
  1293  			args: []string{"-task", "task_3"},
  1294  			expectedAllocs: []AllocationListStubWithJob{
  1295  				allocs["job_v1_group_3_run_running"],
  1296  				allocs["job_v1_group_3_run_complete"],
  1297  				allocs["job_v1_group_3_stop_running"],
  1298  			},
  1299  		},
  1300  		{
  1301  			name: "skip by group and task",
  1302  			args: []string{
  1303  				"-group", "group_1",
  1304  				"-group", "group_2",
  1305  				"-task", "task_2",
  1306  			},
  1307  			// Only group_2 has task_2 in all job versions.
  1308  			expectedAllocs: []AllocationListStubWithJob{
  1309  				allocs["job_v1_group_2_run_running"],
  1310  				allocs["job_v1_group_2_run_complete"],
  1311  				allocs["job_v1_group_2_stop_running"],
  1312  				allocs["job_v2_group_2_run_running"],
  1313  				allocs["job_v2_group_2_run_complete"],
  1314  				allocs["job_v2_group_2_stop_running"],
  1315  			},
  1316  		},
  1317  		{
  1318  			name: "skip by status",
  1319  			args: []string{},
  1320  			expectedAllocs: []AllocationListStubWithJob{
  1321  				allocs["job_v1_group_1_run_running"],
  1322  				allocs["job_v1_group_1_run_complete"],
  1323  				allocs["job_v1_group_1_stop_running"],
  1324  				allocs["job_v1_group_2_run_running"],
  1325  				allocs["job_v1_group_2_run_complete"],
  1326  				allocs["job_v1_group_2_stop_running"],
  1327  				allocs["job_v1_group_3_run_running"],
  1328  				allocs["job_v1_group_3_run_complete"],
  1329  				allocs["job_v1_group_3_stop_running"],
  1330  				allocs["job_v2_group_1_run_running"],
  1331  				allocs["job_v2_group_1_run_complete"],
  1332  				allocs["job_v2_group_1_stop_running"],
  1333  				allocs["job_v2_group_2_run_running"],
  1334  				allocs["job_v2_group_2_run_complete"],
  1335  				allocs["job_v2_group_2_stop_running"],
  1336  			},
  1337  		},
  1338  		{
  1339  			name:           "no matches by group",
  1340  			args:           []string{"-group", "group_404"},
  1341  			expectedAllocs: []AllocationListStubWithJob{},
  1342  		},
  1343  		{
  1344  			name:           "no matches by task",
  1345  			args:           []string{"-task", "task_404"},
  1346  			expectedAllocs: []AllocationListStubWithJob{},
  1347  		},
  1348  		{
  1349  			name: "no matches by task with group",
  1350  			args: []string{
  1351  				"-group", "group_1",
  1352  				"-task", "task_2", // group_1 never has task_2.
  1353  			},
  1354  			expectedAllocs: []AllocationListStubWithJob{},
  1355  		},
  1356  	}
  1357  
  1358  	for _, tc := range testCases {
  1359  		t.Run(tc.name, func(t *testing.T) {
  1360  			ui := cli.NewMockUi()
  1361  			cmd := &JobRestartCommand{
  1362  				Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}},
  1363  			}
  1364  
  1365  			args := append(tc.args, "-verbose", "-yes", "example")
  1366  			code, err := cmd.parseAndValidate(args)
  1367  			must.NoError(t, err)
  1368  			must.Zero(t, code)
  1369  
  1370  			got := cmd.filterAllocs(allAllocs)
  1371  			must.SliceEqFunc(t, tc.expectedAllocs, got, func(a, b AllocationListStubWithJob) bool {
  1372  				return a.ID == b.ID
  1373  			})
  1374  
  1375  			expected := set.FromFunc(tc.expectedAllocs, func(a AllocationListStubWithJob) string {
  1376  				return a.ID
  1377  			})
  1378  			for _, a := range allAllocs {
  1379  				if !expected.Contains(a.ID) {
  1380  					must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("Skipping allocation %q", a.ID))
  1381  				}
  1382  			}
  1383  		})
  1384  	}
  1385  }
  1386  
  1387  func TestJobRestartCommand_onErrorFail(t *testing.T) {
  1388  	ci.Parallel(t)
  1389  
  1390  	ui := cli.NewMockUi()
  1391  	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}
  1392  
  1393  	// Start client and server and wait for node to be ready.
  1394  	srv, client, url := testServer(t, true, nil)
  1395  	defer srv.Shutdown()
  1396  
  1397  	parsedURL, err := neturl.Parse(url)
  1398  	must.NoError(t, err)
  1399  
  1400  	waitForNodes(t, client)
  1401  
  1402  	// Register a job with 3 allocations.
  1403  	jobID := "test_job_restart_command_fail_on_error"
  1404  	job := testJob(jobID)
  1405  	job.TaskGroups[0].Count = pointer.Of(3)
  1406  
  1407  	resp, _, err := client.Jobs().Register(job, nil)
  1408  	must.NoError(t, err)
  1409  
  1410  	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
  1411  	must.Zero(t, code)
  1412  	ui.OutputWriter.Reset()
  1413  
  1414  	// Create a proxy that injects an error on the second allocation restart
  1415  	// request. It also counts restart requests so we can check that the
  1416  	// command stops once the error happens.
  1417  	var allocRestarts int32
  1418  	proxy := httptest.NewServer(&httputil.ReverseProxy{
  1419  		ModifyResponse: func(resp *http.Response) error {
  1420  			if strings.HasSuffix(resp.Request.URL.Path, "/restart") {
  1421  				count := atomic.AddInt32(&allocRestarts, 1)
  1422  				if count == 2 {
  1423  					return fmt.Errorf("fail")
  1424  				}
  1425  			}
  1426  			return nil
  1427  		},
  1428  		Rewrite: func(r *httputil.ProxyRequest) {
  1429  			r.SetURL(parsedURL)
  1430  		},
  1431  	})
  1432  	defer proxy.Close()
  1433  
  1434  	// Run command with -on-error=fail.
  1435  	// Expect only 2 restart requests even though there are 3 allocations.
  1436  	code = cmd.Run([]string{
  1437  		"-address", proxy.URL,
  1438  		"-on-error", jobRestartOnErrorFail,
  1439  		jobID,
  1440  	})
  1441  	must.One(t, code)
  1442  	must.Eq(t, 2, allocRestarts)
  1443  }
  1444  
  1445  // waitTasksRestarted blocks until each given allocation matches the expected
  1446  // restart state and returns the allocations with their updated state.
  1447  //
  1448  // To determine if a restart happened the function looks for a "Restart
  1449  // Signaled" event in the list of task events. Allocations that are reused
  1450  // between tests may contain a restart event from a past test case, leading to
  1451  // false positives.
  1452  //
  1453  // The restarts map contains values structured as group:task:<expect restart?>.
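        //
        // For example, to expect "main" to restart but not "prestart" in the
        // "multiple_tasks" group used above:
        //
        //	restarts := map[string]map[string]bool{
        //		"multiple_tasks": {"prestart": false, "main": true},
        //	}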
  1454  func waitTasksRestarted(
  1455  	t *testing.T,
  1456  	client *api.Client,
  1457  	allocs []*api.AllocationListStub,
  1458  	restarts map[string]map[string]bool,
  1459  ) []*api.Allocation {
  1460  	t.Helper()
  1461  
  1462  	var newAllocs []*api.Allocation
  1463  	testutil.WaitForResult(func() (bool, error) {
  1464  		newAllocs = make([]*api.Allocation, 0, len(allocs))
  1465  
  1466  		for _, alloc := range allocs {
  1467  			if _, ok := restarts[alloc.TaskGroup]; !ok {
  1468  				t.Fatalf("Missing group %q in restarts map", alloc.TaskGroup)
  1469  			}
  1470  
  1471  			// Skip allocations that are not supposed to be running.
  1472  			if alloc.DesiredStatus != api.AllocDesiredStatusRun {
  1473  				continue
  1474  			}
  1475  
  1476  			updated, _, err := client.Allocations().Info(alloc.ID, nil)
  1477  			if err != nil {
  1478  				return false, err
  1479  			}
  1480  			newAllocs = append(newAllocs, updated)
  1481  
  1482  			for task, state := range updated.TaskStates {
  1483  				restarted := false
  1484  				for _, ev := range state.Events {
  1485  					if ev.Type == api.TaskRestartSignal {
  1486  						restarted = true
  1487  						break
  1488  					}
  1489  				}
  1490  
  1491  				if restarted && !restarts[updated.TaskGroup][task] {
  1492  					return false, fmt.Errorf(
  1493  						"task %q in alloc %s for group %q not expected to restart",
  1494  						task, updated.ID, updated.TaskGroup,
  1495  					)
  1496  				}
  1497  				if !restarted && restarts[updated.TaskGroup][task] {
  1498  					return false, fmt.Errorf(
  1499  						"task %q in alloc %s for group %q expected to restart but didn't",
  1500  						task, updated.ID, updated.TaskGroup,
  1501  					)
  1502  				}
  1503  			}
  1504  		}
  1505  		return true, nil
  1506  	}, func(err error) {
  1507  		must.NoError(t, err)
  1508  	})
  1509  
  1510  	return newAllocs
  1511  }
  1512  
  1513  // waitAllocsRescheduled blocks until each given allocation matches the
  1514  // expected reschedule state and returns the allocations with their updated state.
  1515  //
  1516  // To determine if an allocation has been rescheduled the function looks for
  1517  // a non-empty NextAllocation field.
  1518  //
  1519  // The reschedules map maps allocation IDs to a boolean indicating if a
  1520  // reschedule is expected for that allocation.
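        //
        // For example, with placeholder allocation IDs:
        //
        //	reschedules := map[string]bool{"alloc-1": true, "alloc-2": false}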
  1521  func waitAllocsRescheduled(t *testing.T, client *api.Client, reschedules map[string]bool) []*api.Allocation {
  1522  	t.Helper()
  1523  
  1524  	var newAllocs []*api.Allocation
  1525  	testutil.WaitForResult(func() (bool, error) {
  1526  		newAllocs = make([]*api.Allocation, 0, len(reschedules))
  1527  
  1528  		for allocID, reschedule := range reschedules {
  1529  			alloc, _, err := client.Allocations().Info(allocID, nil)
  1530  			if err != nil {
  1531  				return false, err
  1532  			}
  1533  			newAllocs = append(newAllocs, alloc)
  1534  
  1535  			wasRescheduled := alloc.NextAllocation != ""
  1536  			if wasRescheduled && !reschedule {
  1537  				return false, fmt.Errorf("alloc %s not expected to be rescheduled", alloc.ID)
  1538  			}
  1539  			if !wasRescheduled && reschedule {
  1540  				return false, fmt.Errorf("alloc %s expected to be rescheduled but wasn't", alloc.ID)
  1541  			}
  1542  		}
  1543  		return true, nil
  1544  	}, func(err error) {
  1545  		must.NoError(t, err)
  1546  	})
  1547  
  1548  	return newAllocs
  1549  }
  1550  
  1551  // getRestartBatches returns a list of allocations per batch of restarts.
  1552  //
  1553  // Since restarts are issued concurrently, it's expected that allocations in
  1554  // the same batch have fairly close LastRestart times, so a gap of 1s or more
  1555  // between restart times is treated as the start of a new batch.
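        //
        // The returned batches are ordered from earliest to latest restart time.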
  1556  func getRestartBatches(allocs []*api.Allocation, groups []string, task string) [][]*api.Allocation {
  1557  	groupsSet := set.From(groups)
  1558  	batches := [][]*api.Allocation{}
  1559  
  1560  	type allocRestart struct {
  1561  		alloc   *api.Allocation
  1562  		restart time.Time
  1563  	}
  1564  
  1565  	restarts := make([]allocRestart, 0, len(allocs))
  1566  	for _, alloc := range allocs {
  1567  		if !groupsSet.Contains(alloc.TaskGroup) {
  1568  			continue
  1569  		}
  1570  
  1571  		restarts = append(restarts, allocRestart{
  1572  			alloc:   alloc,
  1573  			restart: alloc.TaskStates[task].LastRestart,
  1574  		})
  1575  	}
  1576  
  1577  	sort.Slice(restarts, func(i, j int) bool {
  1578  		return restarts[i].restart.Before(restarts[j].restart)
  1579  	})
  1580  
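        	// Walk the restarts in chronological order and start a new batch whenever
        	// a restart happened one second or more after the first restart of the
        	// current batch.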
  1581  	prev := restarts[0].restart
  1582  	batch := []*api.Allocation{}
  1583  	for _, r := range restarts {
  1584  		if r.restart.Sub(prev) >= time.Second {
  1585  			prev = r.restart
  1586  			batches = append(batches, batch)
  1587  			batch = []*api.Allocation{}
  1588  		}
  1589  		batch = append(batch, r.alloc)
  1590  	}
  1591  	batches = append(batches, batch)
  1592  
  1593  	return batches
  1594  }