github.com/hernad/nomad@v1.6.112/e2e/rescheduling/rescheduling.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package rescheduling
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"reflect"
    10  	"sort"
    11  	"time"
    12  
    13  	e2e "github.com/hernad/nomad/e2e/e2eutil"
    14  	"github.com/hernad/nomad/e2e/framework"
    15  	"github.com/hernad/nomad/helper/uuid"
    16  	"github.com/hernad/nomad/jobspec"
    17  	"github.com/hernad/nomad/testutil"
    18  )
    19  
// ns is the namespace argument handed to the e2e helper calls in this suite.
// NOTE(review): presumably the empty string selects the default namespace —
// confirm against e2eutil.
const ns = ""
    21  
// RescheduleE2ETest is the test case for the rescheduling e2e suite. It embeds
// framework.TC and records the IDs of the jobs registered by a test so that
// AfterEach can stop and purge them.
type RescheduleE2ETest struct {
	framework.TC
	// jobIds holds the IDs of jobs registered since the last AfterEach run.
	jobIds []string
}
    26  
    27  func init() {
    28  	framework.AddSuites(&framework.TestSuite{
    29  		Component:   "Rescheduling",
    30  		CanRunLocal: true,
    31  		Consul:      true,
    32  		Cases: []framework.TestCase{
    33  			new(RescheduleE2ETest),
    34  		},
    35  	})
    36  
    37  }
    38  
    39  func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
    40  	e2e.WaitForLeader(f.T(), tc.Nomad())
    41  	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
    42  }
    43  
    44  func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
    45  	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
    46  		return
    47  	}
    48  
    49  	for _, id := range tc.jobIds {
    50  		err := e2e.StopJob(id, "-purge")
    51  		f.Assert().NoError(err)
    52  	}
    53  	tc.jobIds = []string{}
    54  	_, err := e2e.Command("nomad", "system", "gc")
    55  	f.Assert().NoError(err)
    56  }
    57  
    58  // TestNoReschedule runs a job that should fail and never reschedule
    59  func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
    60  	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
    61  	f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
    62  	tc.jobIds = append(tc.jobIds, jobID)
    63  
    64  	expected := []string{"failed", "failed", "failed"}
    65  	f.NoError(
    66  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
    67  		"should have exactly 3 failed allocs",
    68  	)
    69  }
    70  
    71  // TestNoRescheduleSystem runs a system job that should fail and never reschedule
    72  func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
    73  	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
    74  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
    75  	tc.jobIds = append(tc.jobIds, jobID)
    76  
    77  	f.NoError(
    78  		e2e.WaitForAllocStatusComparison(
    79  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
    80  			func(got []string) bool {
    81  				for _, status := range got {
    82  					if status != "failed" {
    83  						return false
    84  					}
    85  				}
    86  				return true
    87  			}, nil,
    88  		),
    89  		"should have only failed allocs",
    90  	)
    91  }
    92  
    93  // TestDefaultReschedule runs a job that should reschedule after delay
    94  func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
    95  
    96  	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
    97  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
    98  	tc.jobIds = append(tc.jobIds, jobID)
    99  
   100  	expected := []string{"failed", "failed", "failed"}
   101  	f.NoError(
   102  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   103  		"should have exactly 3 failed allocs",
   104  	)
   105  
   106  	// TODO(tgross): return early if "slow" isn't set
   107  	// wait until first exponential delay kicks in and rescheduling is attempted
   108  	time.Sleep(time.Second * 35)
   109  	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
   110  	f.NoError(
   111  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   112  		"should have exactly 6 failed allocs after 35s",
   113  	)
   114  }
   115  
   116  // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts
   117  func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
   118  
   119  	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
   120  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
   121  	tc.jobIds = append(tc.jobIds, jobID)
   122  
   123  	expected := []string{"failed", "failed", "failed"}
   124  	f.NoError(
   125  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   126  		"should have exactly 3 failed allocs",
   127  	)
   128  
   129  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
   130  	f.NoError(err)
   131  	job.ID = &jobID
   132  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
   133  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   134  	f.NoError(err, "could not register updated job")
   135  
   136  	f.NoError(
   137  		e2e.WaitForAllocStatusComparison(
   138  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   139  			func(got []string) bool {
   140  				for _, status := range got {
   141  					if status == "running" {
   142  						return true
   143  					}
   144  				}
   145  				return false
   146  			}, nil,
   147  		),
   148  		"should have at least 1 running alloc",
   149  	)
   150  }
   151  
   152  // TestRescheduleSuccess runs a job that should be running after rescheduling
   153  func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
   154  
   155  	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
   156  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
   157  	tc.jobIds = append(tc.jobIds, jobID)
   158  
   159  	f.NoError(
   160  		e2e.WaitForAllocStatusComparison(
   161  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   162  			func(got []string) bool {
   163  				for _, status := range got {
   164  					if status == "running" {
   165  						return true
   166  					}
   167  				}
   168  				return false
   169  			}, nil,
   170  		),
   171  		"should have at least 1 running alloc",
   172  	)
   173  }
   174  
   175  // TestRescheduleWithUpdate updates a running job to fail, and verifies that
   176  // it gets rescheduled
   177  func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
   178  
   179  	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
   180  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
   181  	tc.jobIds = append(tc.jobIds, jobID)
   182  
   183  	expected := []string{"running", "running", "running"}
   184  	f.NoError(
   185  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   186  		"should have exactly 3 running allocs",
   187  	)
   188  
   189  	// reschedule to make fail
   190  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
   191  	f.NoError(err)
   192  	job.ID = &jobID
   193  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   194  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   195  	f.NoError(err, "could not register updated job")
   196  
   197  	f.NoError(
   198  		e2e.WaitForAllocStatusComparison(
   199  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   200  			func(got []string) bool { return len(got) > 0 }, nil,
   201  		),
   202  		"should have rescheduled allocs until progress deadline",
   203  	)
   204  }
   205  
   206  // TestRescheduleWithCanary updates a running job to fail, and verify that the
   207  // canary gets rescheduled
   208  func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
   209  
   210  	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
   211  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
   212  	tc.jobIds = append(tc.jobIds, jobID)
   213  
   214  	expected := []string{"running", "running", "running"}
   215  	f.NoError(
   216  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   217  		"should have exactly 3 running allocs",
   218  	)
   219  
   220  	f.NoError(
   221  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   222  		"deployment should be successful")
   223  
   224  	// reschedule to make fail
   225  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
   226  	f.NoError(err)
   227  	job.ID = &jobID
   228  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   229  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   230  	f.NoError(err, "could not register updated job")
   231  
   232  	f.NoError(
   233  		e2e.WaitForAllocStatusComparison(
   234  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   235  			func(got []string) bool { return len(got) > 0 }, nil,
   236  		),
   237  		"should have rescheduled allocs until progress deadline",
   238  	)
   239  
   240  	f.NoError(
   241  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   242  		"deployment should be running")
   243  }
   244  
   245  // TestRescheduleWithCanaryAutoRevert updates a running job to fail, and
   246  // verifies that the job gets reverted.
   247  func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
   248  
   249  	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
   250  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
   251  	tc.jobIds = append(tc.jobIds, jobID)
   252  
   253  	expected := []string{"running", "running", "running"}
   254  	f.NoError(
   255  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   256  		"should have exactly 3 running allocs",
   257  	)
   258  
   259  	f.NoError(
   260  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   261  		"deployment should be successful")
   262  
   263  	// reschedule to make fail
   264  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
   265  	f.NoError(err)
   266  	job.ID = &jobID
   267  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   268  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   269  	f.NoError(err, "could not register updated job")
   270  
   271  	f.NoError(
   272  		e2e.WaitForAllocStatusComparison(
   273  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   274  			func(got []string) bool { return len(got) > 0 }, nil,
   275  		),
   276  		"should have new allocs after update",
   277  	)
   278  
   279  	// then we'll fail and revert
   280  	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
   281  	f.NoError(
   282  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   283  		"should have exactly 3 running reverted allocs",
   284  	)
   285  
   286  	f.NoError(
   287  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   288  		"deployment should be successful")
   289  }
   290  
   291  // TestRescheduleMaxParallel updates a job with a max_parallel config
   292  func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
   293  
   294  	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
   295  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
   296  	tc.jobIds = append(tc.jobIds, jobID)
   297  
   298  	expected := []string{"running", "running", "running"}
   299  	f.NoError(
   300  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   301  		"should have exactly 3 running allocs",
   302  	)
   303  
   304  	f.NoError(
   305  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   306  		"deployment should be successful")
   307  
   308  	// reschedule to make fail
   309  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
   310  	f.NoError(err)
   311  	job.ID = &jobID
   312  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   313  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   314  	f.NoError(err, "could not register updated job")
   315  
   316  	expected = []string{"complete", "failed", "failed", "running", "running"}
   317  
   318  	f.NoError(
   319  		e2e.WaitForAllocStatusComparison(
   320  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   321  			func(got []string) bool {
   322  				sort.Strings(got)
   323  				return reflect.DeepEqual(got, expected)
   324  			}, nil,
   325  		),
   326  		"should have failed allocs including rescheduled failed allocs",
   327  	)
   328  
   329  	f.NoError(
   330  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   331  		"deployment should be running")
   332  }
   333  
   334  // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
   335  // config that will autorevert on failure
   336  func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
   337  
   338  	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
   339  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
   340  	tc.jobIds = append(tc.jobIds, jobID)
   341  
   342  	expected := []string{"running", "running", "running"}
   343  	f.NoError(
   344  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   345  		"should have exactly 3 running allocs",
   346  	)
   347  
   348  	f.NoError(
   349  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   350  		"deployment should be successful")
   351  
   352  	// reschedule to make fail
   353  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
   354  	f.NoError(err)
   355  	job.ID = &jobID
   356  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   357  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   358  	f.NoError(err, "could not e2e.Register updated job")
   359  
   360  	f.NoError(
   361  		e2e.WaitForAllocStatusComparison(
   362  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   363  			func(got []string) bool { return len(got) > 0 }, nil,
   364  		),
   365  		"should have new allocs after update",
   366  	)
   367  
   368  	// wait for the revert
   369  	expected = []string{"complete", "failed", "running", "running", "running"}
   370  	f.NoError(
   371  		e2e.WaitForAllocStatusComparison(
   372  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   373  			func(got []string) bool {
   374  				sort.Strings(got)
   375  				return reflect.DeepEqual(got, expected)
   376  			}, nil,
   377  		),
   378  		"should have one successful, one failed, and 3 reverted allocs",
   379  	)
   380  
   381  	// at this point the allocs have been checked but we need to wait for the
   382  	// deployment to be marked complete before we can assert that it's successful
   383  	// and verify the count of deployments
   384  	f.NoError(
   385  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   386  		"most recent deployment should be successful")
   387  
   388  	out, err := e2e.Command("nomad", "deployment", "status")
   389  	f.NoError(err, "could not get deployment status")
   390  
   391  	results, err := e2e.ParseColumns(out)
   392  	f.NoError(err, "could not parse deployment status")
   393  	statuses := map[string]int{}
   394  	for _, row := range results {
   395  		if row["Job ID"] == jobID {
   396  			statuses[row["Status"]]++
   397  		}
   398  	}
   399  
   400  	f.Equal(1, statuses["failed"],
   401  		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
   402  	f.Equal(2, statuses["successful"],
   403  		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
   404  }
   405  
   406  // TestRescheduleProgressDeadline verifies the progress deadline is reset with
   407  // each healthy allocation, and that a rescheduled allocation does not.
   408  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
   409  
   410  	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
   411  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
   412  	tc.jobIds = append(tc.jobIds, jobID)
   413  
   414  	expected := []string{"running"}
   415  	f.NoError(
   416  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   417  		"should have a running allocation",
   418  	)
   419  
   420  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   421  	f.NoError(err, "couldn't look up deployment")
   422  
   423  	oldDeadline, err := getProgressDeadline(deploymentID)
   424  	f.NoError(err, "could not get progress deadline")
   425  	time.Sleep(time.Second * 20)
   426  
   427  	newDeadline, err := getProgressDeadline(deploymentID)
   428  	f.NoError(err, "could not get new progress deadline")
   429  	f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated")
   430  
   431  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   432  		"deployment should be successful")
   433  }
   434  
   435  // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with
   436  // each healthy allocation, and that a rescheduled allocation does not.
   437  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
   438  
   439  	jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8]
   440  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
   441  	tc.jobIds = append(tc.jobIds, jobID)
   442  
   443  	testutil.WaitForResult(func() (bool, error) {
   444  		_, err := e2e.LastDeploymentID(jobID, ns)
   445  		return err == nil, err
   446  	}, func(err error) {
   447  		f.NoError(err, "deployment wasn't created yet")
   448  	})
   449  
   450  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   451  	f.NoError(err, "couldn't look up deployment")
   452  
   453  	oldDeadline, err := getProgressDeadline(deploymentID)
   454  	f.NoError(err, "could not get progress deadline")
   455  	time.Sleep(time.Second * 20)
   456  
   457  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
   458  		"deployment should be failed")
   459  
   460  	f.NoError(
   461  		e2e.WaitForAllocStatusComparison(
   462  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   463  			func(got []string) bool {
   464  				for _, status := range got {
   465  					if status != "failed" {
   466  						return false
   467  					}
   468  				}
   469  				return true
   470  			}, nil,
   471  		),
   472  		"should have only failed allocs",
   473  	)
   474  
   475  	newDeadline, err := getProgressDeadline(deploymentID)
   476  	f.NoError(err, "could not get new progress deadline")
   477  	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
   478  }
   479  
   480  func getProgressDeadline(deploymentID string) (time.Time, error) {
   481  
   482  	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
   483  	if err != nil {
   484  		return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out)
   485  	}
   486  
   487  	section, err := e2e.GetSection(out, "Deployed")
   488  	if err != nil {
   489  		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
   490  	}
   491  
   492  	rows, err := e2e.ParseColumns(section)
   493  	if err != nil {
   494  		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
   495  	}
   496  
   497  	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
   498  	raw := rows[0]["Progress Deadline"]
   499  	return time.Parse(layout, raw)
   500  }