github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/rescheduling/rescheduling.go (about)

     1  package rescheduling
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"reflect"
     7  	"sort"
     8  	"time"
     9  
    10  	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
    11  	"github.com/hashicorp/nomad/e2e/framework"
    12  	"github.com/hashicorp/nomad/helper/uuid"
    13  	"github.com/hashicorp/nomad/jobspec"
    14  	"github.com/hashicorp/nomad/testutil"
    15  )
    16  
// ns is the namespace argument passed to the e2e helper calls in this file.
// NOTE(review): the empty string presumably selects the default namespace —
// confirm against the e2eutil helpers.
const ns = ""
    18  
// RescheduleE2ETest is the test case registered for the "Rescheduling" e2e
// suite. It embeds framework.TC and records the IDs of the jobs that each
// test registers so that AfterEach can stop and purge them.
type RescheduleE2ETest struct {
	framework.TC
	// jobIds holds the IDs of jobs registered by the tests; AfterEach resets it.
	jobIds []string
}
    23  
    24  func init() {
    25  	framework.AddSuites(&framework.TestSuite{
    26  		Component:   "Rescheduling",
    27  		CanRunLocal: true,
    28  		Consul:      true,
    29  		Cases: []framework.TestCase{
    30  			new(RescheduleE2ETest),
    31  		},
    32  	})
    33  
    34  }
    35  
    36  func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
    37  	e2e.WaitForLeader(f.T(), tc.Nomad())
    38  	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
    39  }
    40  
    41  func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
    42  	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
    43  		return
    44  	}
    45  
    46  	for _, id := range tc.jobIds {
    47  		err := e2e.StopJob(id, "-purge")
    48  		f.Assert().NoError(err)
    49  	}
    50  	tc.jobIds = []string{}
    51  	_, err := e2e.Command("nomad", "system", "gc")
    52  	f.Assert().NoError(err)
    53  }
    54  
    55  // TestNoReschedule runs a job that should fail and never reschedule
    56  func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
    57  	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
    58  	f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
    59  	tc.jobIds = append(tc.jobIds, jobID)
    60  
    61  	expected := []string{"failed", "failed", "failed"}
    62  	f.NoError(
    63  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
    64  		"should have exactly 3 failed allocs",
    65  	)
    66  }
    67  
    68  // TestNoRescheduleSystem runs a system job that should fail and never reschedule
    69  func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
    70  	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
    71  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
    72  	tc.jobIds = append(tc.jobIds, jobID)
    73  
    74  	f.NoError(
    75  		e2e.WaitForAllocStatusComparison(
    76  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
    77  			func(got []string) bool {
    78  				for _, status := range got {
    79  					if status != "failed" {
    80  						return false
    81  					}
    82  				}
    83  				return true
    84  			}, nil,
    85  		),
    86  		"should have only failed allocs",
    87  	)
    88  }
    89  
    90  // TestDefaultReschedule runs a job that should reschedule after delay
    91  func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
    92  
    93  	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
    94  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
    95  	tc.jobIds = append(tc.jobIds, jobID)
    96  
    97  	expected := []string{"failed", "failed", "failed"}
    98  	f.NoError(
    99  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   100  		"should have exactly 3 failed allocs",
   101  	)
   102  
   103  	// TODO(tgross): return early if "slow" isn't set
   104  	// wait until first exponential delay kicks in and rescheduling is attempted
   105  	time.Sleep(time.Second * 35)
   106  	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
   107  	f.NoError(
   108  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   109  		"should have exactly 6 failed allocs after 35s",
   110  	)
   111  }
   112  
   113  // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts
   114  func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
   115  
   116  	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
   117  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
   118  	tc.jobIds = append(tc.jobIds, jobID)
   119  
   120  	expected := []string{"failed", "failed", "failed"}
   121  	f.NoError(
   122  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   123  		"should have exactly 3 failed allocs",
   124  	)
   125  
   126  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
   127  	f.NoError(err)
   128  	job.ID = &jobID
   129  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
   130  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   131  	f.NoError(err, "could not register updated job")
   132  
   133  	f.NoError(
   134  		e2e.WaitForAllocStatusComparison(
   135  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   136  			func(got []string) bool {
   137  				for _, status := range got {
   138  					if status == "running" {
   139  						return true
   140  					}
   141  				}
   142  				return false
   143  			}, nil,
   144  		),
   145  		"should have at least 1 running alloc",
   146  	)
   147  }
   148  
   149  // TestRescheduleSuccess runs a job that should be running after rescheduling
   150  func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
   151  
   152  	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
   153  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
   154  	tc.jobIds = append(tc.jobIds, jobID)
   155  
   156  	f.NoError(
   157  		e2e.WaitForAllocStatusComparison(
   158  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   159  			func(got []string) bool {
   160  				for _, status := range got {
   161  					if status == "running" {
   162  						return true
   163  					}
   164  				}
   165  				return false
   166  			}, nil,
   167  		),
   168  		"should have at least 1 running alloc",
   169  	)
   170  }
   171  
   172  // TestRescheduleWithUpdate updates a running job to fail, and verifies that
   173  // it gets rescheduled
   174  func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
   175  
   176  	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
   177  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
   178  	tc.jobIds = append(tc.jobIds, jobID)
   179  
   180  	expected := []string{"running", "running", "running"}
   181  	f.NoError(
   182  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   183  		"should have exactly 3 running allocs",
   184  	)
   185  
   186  	// reschedule to make fail
   187  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
   188  	f.NoError(err)
   189  	job.ID = &jobID
   190  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   191  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   192  	f.NoError(err, "could not register updated job")
   193  
   194  	f.NoError(
   195  		e2e.WaitForAllocStatusComparison(
   196  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   197  			func(got []string) bool { return len(got) > 0 }, nil,
   198  		),
   199  		"should have rescheduled allocs until progress deadline",
   200  	)
   201  }
   202  
   203  // TestRescheduleWithCanary updates a running job to fail, and verify that the
   204  // canary gets rescheduled
   205  func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
   206  
   207  	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
   208  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
   209  	tc.jobIds = append(tc.jobIds, jobID)
   210  
   211  	expected := []string{"running", "running", "running"}
   212  	f.NoError(
   213  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   214  		"should have exactly 3 running allocs",
   215  	)
   216  
   217  	f.NoError(
   218  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   219  		"deployment should be successful")
   220  
   221  	// reschedule to make fail
   222  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
   223  	f.NoError(err)
   224  	job.ID = &jobID
   225  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   226  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   227  	f.NoError(err, "could not register updated job")
   228  
   229  	f.NoError(
   230  		e2e.WaitForAllocStatusComparison(
   231  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   232  			func(got []string) bool { return len(got) > 0 }, nil,
   233  		),
   234  		"should have rescheduled allocs until progress deadline",
   235  	)
   236  
   237  	f.NoError(
   238  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   239  		"deployment should be running")
   240  }
   241  
   242  // TestRescheduleWithCanaryAutoRevert updates a running job to fail, and
   243  // verifies that the job gets reverted.
   244  func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
   245  
   246  	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
   247  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
   248  	tc.jobIds = append(tc.jobIds, jobID)
   249  
   250  	expected := []string{"running", "running", "running"}
   251  	f.NoError(
   252  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   253  		"should have exactly 3 running allocs",
   254  	)
   255  
   256  	f.NoError(
   257  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   258  		"deployment should be successful")
   259  
   260  	// reschedule to make fail
   261  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
   262  	f.NoError(err)
   263  	job.ID = &jobID
   264  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   265  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   266  	f.NoError(err, "could not register updated job")
   267  
   268  	f.NoError(
   269  		e2e.WaitForAllocStatusComparison(
   270  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   271  			func(got []string) bool { return len(got) > 0 }, nil,
   272  		),
   273  		"should have new allocs after update",
   274  	)
   275  
   276  	// then we'll fail and revert
   277  	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
   278  	f.NoError(
   279  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   280  		"should have exactly 3 running reverted allocs",
   281  	)
   282  
   283  	f.NoError(
   284  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   285  		"deployment should be successful")
   286  }
   287  
   288  // TestRescheduleMaxParallel updates a job with a max_parallel config
   289  func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
   290  
   291  	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
   292  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
   293  	tc.jobIds = append(tc.jobIds, jobID)
   294  
   295  	expected := []string{"running", "running", "running"}
   296  	f.NoError(
   297  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   298  		"should have exactly 3 running allocs",
   299  	)
   300  
   301  	f.NoError(
   302  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   303  		"deployment should be successful")
   304  
   305  	// reschedule to make fail
   306  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
   307  	f.NoError(err)
   308  	job.ID = &jobID
   309  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   310  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   311  	f.NoError(err, "could not register updated job")
   312  
   313  	expected = []string{"complete", "failed", "failed", "running", "running"}
   314  
   315  	f.NoError(
   316  		e2e.WaitForAllocStatusComparison(
   317  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   318  			func(got []string) bool {
   319  				sort.Strings(got)
   320  				return reflect.DeepEqual(got, expected)
   321  			}, nil,
   322  		),
   323  		"should have failed allocs including rescheduled failed allocs",
   324  	)
   325  
   326  	f.NoError(
   327  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   328  		"deployment should be running")
   329  }
   330  
   331  // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
   332  // config that will autorevert on failure
   333  func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
   334  
   335  	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
   336  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
   337  	tc.jobIds = append(tc.jobIds, jobID)
   338  
   339  	expected := []string{"running", "running", "running"}
   340  	f.NoError(
   341  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   342  		"should have exactly 3 running allocs",
   343  	)
   344  
   345  	f.NoError(
   346  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   347  		"deployment should be successful")
   348  
   349  	// reschedule to make fail
   350  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
   351  	f.NoError(err)
   352  	job.ID = &jobID
   353  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   354  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   355  	f.NoError(err, "could not e2e.Register updated job")
   356  
   357  	f.NoError(
   358  		e2e.WaitForAllocStatusComparison(
   359  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   360  			func(got []string) bool { return len(got) > 0 }, nil,
   361  		),
   362  		"should have new allocs after update",
   363  	)
   364  
   365  	// wait for the revert
   366  	expected = []string{"complete", "failed", "running", "running", "running"}
   367  	f.NoError(
   368  		e2e.WaitForAllocStatusComparison(
   369  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   370  			func(got []string) bool {
   371  				sort.Strings(got)
   372  				return reflect.DeepEqual(got, expected)
   373  			}, nil,
   374  		),
   375  		"should have one successful, one failed, and 3 reverted allocs",
   376  	)
   377  
   378  	// at this point the allocs have been checked but we need to wait for the
   379  	// deployment to be marked complete before we can assert that it's successful
   380  	// and verify the count of deployments
   381  	f.NoError(
   382  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   383  		"most recent deployment should be successful")
   384  
   385  	out, err := e2e.Command("nomad", "deployment", "status")
   386  	f.NoError(err, "could not get deployment status")
   387  
   388  	results, err := e2e.ParseColumns(out)
   389  	f.NoError(err, "could not parse deployment status")
   390  	statuses := map[string]int{}
   391  	for _, row := range results {
   392  		if row["Job ID"] == jobID {
   393  			statuses[row["Status"]]++
   394  		}
   395  	}
   396  
   397  	f.Equal(1, statuses["failed"],
   398  		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
   399  	f.Equal(2, statuses["successful"],
   400  		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
   401  }
   402  
   403  // TestRescheduleProgressDeadline verifies the progress deadline is reset with
   404  // each healthy allocation, and that a rescheduled allocation does not.
   405  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
   406  
   407  	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
   408  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
   409  	tc.jobIds = append(tc.jobIds, jobID)
   410  
   411  	expected := []string{"running"}
   412  	f.NoError(
   413  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   414  		"should have a running allocation",
   415  	)
   416  
   417  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   418  	f.NoError(err, "couldn't look up deployment")
   419  
   420  	oldDeadline, err := getProgressDeadline(deploymentID)
   421  	f.NoError(err, "could not get progress deadline")
   422  	time.Sleep(time.Second * 20)
   423  
   424  	newDeadline, err := getProgressDeadline(deploymentID)
   425  	f.NoError(err, "could not get new progress deadline")
   426  	f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated")
   427  
   428  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   429  		"deployment should be successful")
   430  }
   431  
   432  // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with
   433  // each healthy allocation, and that a rescheduled allocation does not.
   434  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
   435  
   436  	jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8]
   437  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
   438  	tc.jobIds = append(tc.jobIds, jobID)
   439  
   440  	testutil.WaitForResult(func() (bool, error) {
   441  		_, err := e2e.LastDeploymentID(jobID, ns)
   442  		return err == nil, err
   443  	}, func(err error) {
   444  		f.NoError(err, "deployment wasn't created yet")
   445  	})
   446  
   447  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   448  	f.NoError(err, "couldn't look up deployment")
   449  
   450  	oldDeadline, err := getProgressDeadline(deploymentID)
   451  	f.NoError(err, "could not get progress deadline")
   452  	time.Sleep(time.Second * 20)
   453  
   454  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
   455  		"deployment should be failed")
   456  
   457  	f.NoError(
   458  		e2e.WaitForAllocStatusComparison(
   459  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   460  			func(got []string) bool {
   461  				for _, status := range got {
   462  					if status != "failed" {
   463  						return false
   464  					}
   465  				}
   466  				return true
   467  			}, nil,
   468  		),
   469  		"should have only failed allocs",
   470  	)
   471  
   472  	newDeadline, err := getProgressDeadline(deploymentID)
   473  	f.NoError(err, "could not get new progress deadline")
   474  	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
   475  }
   476  
   477  func getProgressDeadline(deploymentID string) (time.Time, error) {
   478  
   479  	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
   480  	if err != nil {
   481  		return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out)
   482  	}
   483  
   484  	section, err := e2e.GetSection(out, "Deployed")
   485  	if err != nil {
   486  		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
   487  	}
   488  
   489  	rows, err := e2e.ParseColumns(section)
   490  	if err != nil {
   491  		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
   492  	}
   493  
   494  	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
   495  	raw := rows[0]["Progress Deadline"]
   496  	return time.Parse(layout, raw)
   497  }