github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/e2e/rescheduling/rescheduling.go (about)

     1  package rescheduling
     2  
     3  import (
     4  	"fmt"
     5  	"os"
     6  	"reflect"
     7  	"sort"
     8  	"time"
     9  
    10  	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
    11  	"github.com/hashicorp/nomad/e2e/framework"
    12  	"github.com/hashicorp/nomad/helper/uuid"
    13  	"github.com/hashicorp/nomad/jobspec"
    14  )
    15  
// ns is the value passed as the namespace-position argument to the e2e helper
// calls in this file (presumably the Nomad namespace — confirm against
// e2eutil); this suite always uses the empty string.
const ns = ""
    17  
// RescheduleE2ETest is the test case registered for the "Rescheduling" suite.
// It embeds framework.TC and remembers which jobs the running test registered.
type RescheduleE2ETest struct {
	framework.TC
	// jobIds holds the IDs of the jobs registered by the current test so that
	// AfterEach can stop and purge them.
	jobIds []string
}
    22  
    23  func init() {
    24  	framework.AddSuites(&framework.TestSuite{
    25  		Component:   "Rescheduling",
    26  		CanRunLocal: true,
    27  		Consul:      true,
    28  		Cases: []framework.TestCase{
    29  			new(RescheduleE2ETest),
    30  		},
    31  	})
    32  
    33  }
    34  
    35  func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) {
    36  	e2e.WaitForLeader(f.T(), tc.Nomad())
    37  	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
    38  }
    39  
    40  func (tc *RescheduleE2ETest) AfterEach(f *framework.F) {
    41  	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
    42  		return
    43  	}
    44  
    45  	for _, id := range tc.jobIds {
    46  		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
    47  		f.Assert().NoError(err)
    48  	}
    49  	tc.jobIds = []string{}
    50  	_, err := e2e.Command("nomad", "system", "gc")
    51  	f.Assert().NoError(err)
    52  }
    53  
    54  // TestNoReschedule runs a job that should fail and never reschedule
    55  func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) {
    56  	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
    57  	f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad"))
    58  	tc.jobIds = append(tc.jobIds, jobID)
    59  
    60  	expected := []string{"failed", "failed", "failed"}
    61  	f.NoError(
    62  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
    63  		"should have exactly 3 failed allocs",
    64  	)
    65  }
    66  
    67  // TestNoRescheduleSystem runs a system job that should fail and never reschedule
    68  func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) {
    69  	jobID := "test-reschedule-system-" + uuid.Generate()[0:8]
    70  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad"))
    71  	tc.jobIds = append(tc.jobIds, jobID)
    72  
    73  	f.NoError(
    74  		e2e.WaitForAllocStatusComparison(
    75  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
    76  			func(got []string) bool {
    77  				for _, status := range got {
    78  					if status != "failed" {
    79  						return false
    80  					}
    81  				}
    82  				return true
    83  			}, nil,
    84  		),
    85  		"should have only failed allocs",
    86  	)
    87  }
    88  
    89  // TestDefaultReschedule runs a job that should reschedule after delay
    90  func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) {
    91  
    92  	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
    93  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad"))
    94  	tc.jobIds = append(tc.jobIds, jobID)
    95  
    96  	expected := []string{"failed", "failed", "failed"}
    97  	f.NoError(
    98  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
    99  		"should have exactly 3 failed allocs",
   100  	)
   101  
   102  	// TODO(tgross): return early if "slow" isn't set
   103  	// wait until first exponential delay kicks in and rescheduling is attempted
   104  	time.Sleep(time.Second * 35)
   105  	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
   106  	f.NoError(
   107  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   108  		"should have exactly 6 failed allocs after 35s",
   109  	)
   110  }
   111  
   112  // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts
   113  func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) {
   114  
   115  	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
   116  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad"))
   117  	tc.jobIds = append(tc.jobIds, jobID)
   118  
   119  	expected := []string{"failed", "failed", "failed"}
   120  	f.NoError(
   121  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   122  		"should have exactly 3 failed allocs",
   123  	)
   124  
   125  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad")
   126  	f.NoError(err)
   127  	job.ID = &jobID
   128  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
   129  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   130  	f.NoError(err, "could not register updated job")
   131  
   132  	f.NoError(
   133  		e2e.WaitForAllocStatusComparison(
   134  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   135  			func(got []string) bool {
   136  				for _, status := range got {
   137  					if status == "running" {
   138  						return true
   139  					}
   140  				}
   141  				return false
   142  			}, nil,
   143  		),
   144  		"should have at least 1 running alloc",
   145  	)
   146  }
   147  
   148  // TestRescheduleSuccess runs a job that should be running after rescheduling
   149  func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) {
   150  
   151  	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
   152  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad"))
   153  	tc.jobIds = append(tc.jobIds, jobID)
   154  
   155  	f.NoError(
   156  		e2e.WaitForAllocStatusComparison(
   157  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   158  			func(got []string) bool {
   159  				for _, status := range got {
   160  					if status == "running" {
   161  						return true
   162  					}
   163  				}
   164  				return false
   165  			}, nil,
   166  		),
   167  		"should have at least 1 running alloc",
   168  	)
   169  }
   170  
   171  // TestRescheduleWithUpdate updates a running job to fail, and verifies that
   172  // it gets rescheduled
   173  func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) {
   174  
   175  	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
   176  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad"))
   177  	tc.jobIds = append(tc.jobIds, jobID)
   178  
   179  	expected := []string{"running", "running", "running"}
   180  	f.NoError(
   181  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   182  		"should have exactly 3 running allocs",
   183  	)
   184  
   185  	// reschedule to make fail
   186  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad")
   187  	f.NoError(err)
   188  	job.ID = &jobID
   189  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   190  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   191  	f.NoError(err, "could not register updated job")
   192  
   193  	f.NoError(
   194  		e2e.WaitForAllocStatusComparison(
   195  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   196  			func(got []string) bool { return len(got) > 0 }, nil,
   197  		),
   198  		"should have rescheduled allocs until progress deadline",
   199  	)
   200  }
   201  
   202  // TestRescheduleWithCanary updates a running job to fail, and verify that the
   203  // canary gets rescheduled
   204  func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) {
   205  
   206  	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
   207  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad"))
   208  	tc.jobIds = append(tc.jobIds, jobID)
   209  
   210  	expected := []string{"running", "running", "running"}
   211  	f.NoError(
   212  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   213  		"should have exactly 3 running allocs",
   214  	)
   215  
   216  	f.NoError(
   217  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   218  		"deployment should be successful")
   219  
   220  	// reschedule to make fail
   221  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad")
   222  	f.NoError(err)
   223  	job.ID = &jobID
   224  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   225  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   226  	f.NoError(err, "could not register updated job")
   227  
   228  	f.NoError(
   229  		e2e.WaitForAllocStatusComparison(
   230  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   231  			func(got []string) bool { return len(got) > 0 }, nil,
   232  		),
   233  		"should have rescheduled allocs until progress deadline",
   234  	)
   235  
   236  	f.NoError(
   237  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   238  		"deployment should be running")
   239  }
   240  
   241  // TestRescheduleWithCanary updates a running job to fail, and verifies that
   242  // the job gets reverted
   243  func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) {
   244  
   245  	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
   246  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad"))
   247  	tc.jobIds = append(tc.jobIds, jobID)
   248  
   249  	expected := []string{"running", "running", "running"}
   250  	f.NoError(
   251  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   252  		"should have exactly 3 running allocs",
   253  	)
   254  
   255  	f.NoError(
   256  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   257  		"deployment should be successful")
   258  
   259  	// reschedule to make fail
   260  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad")
   261  	f.NoError(err)
   262  	job.ID = &jobID
   263  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   264  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   265  	f.NoError(err, "could not register updated job")
   266  
   267  	f.NoError(
   268  		e2e.WaitForAllocStatusComparison(
   269  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   270  			func(got []string) bool { return len(got) > 0 }, nil,
   271  		),
   272  		"should have new allocs after update",
   273  	)
   274  
   275  	// then we'll fail and revert
   276  	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
   277  	f.NoError(
   278  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   279  		"should have exactly 3 running reverted allocs",
   280  	)
   281  
   282  	f.NoError(
   283  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   284  		"deployment should be successful")
   285  }
   286  
   287  // TestRescheduleMaxParallel updates a job with a max_parallel config
   288  func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) {
   289  
   290  	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
   291  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad"))
   292  	tc.jobIds = append(tc.jobIds, jobID)
   293  
   294  	expected := []string{"running", "running", "running"}
   295  	f.NoError(
   296  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   297  		"should have exactly 3 running allocs",
   298  	)
   299  
   300  	f.NoError(
   301  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   302  		"deployment should be successful")
   303  
   304  	// reschedule to make fail
   305  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad")
   306  	f.NoError(err)
   307  	job.ID = &jobID
   308  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   309  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   310  	f.NoError(err, "could not register updated job")
   311  
   312  	expected = []string{"complete", "failed", "failed", "running", "running"}
   313  
   314  	f.NoError(
   315  		e2e.WaitForAllocStatusComparison(
   316  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   317  			func(got []string) bool {
   318  				sort.Strings(got)
   319  				return reflect.DeepEqual(got, expected)
   320  			}, nil,
   321  		),
   322  		"should have failed allocs including rescheduled failed allocs",
   323  	)
   324  
   325  	f.NoError(
   326  		e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
   327  		"deployment should be running")
   328  }
   329  
   330  // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel
   331  // config that will autorevert on failure
   332  func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) {
   333  
   334  	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
   335  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad"))
   336  	tc.jobIds = append(tc.jobIds, jobID)
   337  
   338  	expected := []string{"running", "running", "running"}
   339  	f.NoError(
   340  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   341  		"should have exactly 3 running allocs",
   342  	)
   343  
   344  	f.NoError(
   345  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   346  		"deployment should be successful")
   347  
   348  	// reschedule to make fail
   349  	job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad")
   350  	f.NoError(err)
   351  	job.ID = &jobID
   352  	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
   353  	_, _, err = tc.Nomad().Jobs().Register(job, nil)
   354  	f.NoError(err, "could not e2e.Register updated job")
   355  
   356  	f.NoError(
   357  		e2e.WaitForAllocStatusComparison(
   358  			func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) },
   359  			func(got []string) bool { return len(got) > 0 }, nil,
   360  		),
   361  		"should have new allocs after update",
   362  	)
   363  
   364  	// wait for the revert
   365  	expected = []string{"complete", "failed", "running", "running", "running"}
   366  	f.NoError(
   367  		e2e.WaitForAllocStatusComparison(
   368  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   369  			func(got []string) bool {
   370  				sort.Strings(got)
   371  				return reflect.DeepEqual(got, expected)
   372  			}, nil,
   373  		),
   374  		"should have one successful, one failed, and 3 reverted allocs",
   375  	)
   376  
   377  	// at this point the allocs have been checked but we need to wait for the
   378  	// deployment to be marked complete before we can assert that it's successful
   379  	// and verify the count of deployments
   380  	f.NoError(
   381  		e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   382  		"most recent deployment should be successful")
   383  
   384  	out, err := e2e.Command("nomad", "deployment", "status")
   385  	f.NoError(err, "could not get deployment status")
   386  
   387  	results, err := e2e.ParseColumns(out)
   388  	f.NoError(err, "could not parse deployment status")
   389  	statuses := map[string]int{}
   390  	for _, row := range results {
   391  		if row["Job ID"] == jobID {
   392  			statuses[row["Status"]]++
   393  		}
   394  	}
   395  
   396  	f.Equal(1, statuses["failed"],
   397  		fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out))
   398  	f.Equal(2, statuses["successful"],
   399  		fmt.Sprintf("expected 2 successful deployments, got:\n%s", out))
   400  }
   401  
   402  // TestRescheduleProgressDeadline verifies the progress deadline is reset with
   403  // each healthy allocation, and that a rescheduled allocation does not.
   404  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) {
   405  
   406  	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
   407  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad"))
   408  	tc.jobIds = append(tc.jobIds, jobID)
   409  
   410  	expected := []string{"running"}
   411  	f.NoError(
   412  		e2e.WaitForAllocStatusExpected(jobID, ns, expected),
   413  		"should have a running allocation",
   414  	)
   415  
   416  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   417  	f.NoError(err, "couldn't look up deployment")
   418  
   419  	oldDeadline, err := getProgressDeadline(deploymentID)
   420  	f.NoError(err, "could not get progress deadline")
   421  	time.Sleep(time.Second * 20)
   422  
   423  	newDeadline, err := getProgressDeadline(deploymentID)
   424  	f.NoError(err, "could not get new progress deadline")
   425  	f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated")
   426  
   427  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
   428  		"deployment should be successful")
   429  }
   430  
   431  // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with
   432  // each healthy allocation, and that a rescheduled allocation does not.
   433  func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
   434  
   435  	jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8]
   436  	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
   437  	tc.jobIds = append(tc.jobIds, jobID)
   438  
   439  	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
   440  	f.NoError(err, "couldn't look up deployment")
   441  
   442  	oldDeadline, err := getProgressDeadline(deploymentID)
   443  	f.NoError(err, "could not get progress deadline")
   444  	time.Sleep(time.Second * 20)
   445  
   446  	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
   447  		"deployment should be failed")
   448  
   449  	f.NoError(
   450  		e2e.WaitForAllocStatusComparison(
   451  			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
   452  			func(got []string) bool {
   453  				for _, status := range got {
   454  					if status != "failed" {
   455  						return false
   456  					}
   457  				}
   458  				return true
   459  			}, nil,
   460  		),
   461  		"should have only failed allocs",
   462  	)
   463  
   464  	newDeadline, err := getProgressDeadline(deploymentID)
   465  	f.NoError(err, "could not get new progress deadline")
   466  	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
   467  }
   468  
   469  func getProgressDeadline(deploymentID string) (time.Time, error) {
   470  
   471  	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
   472  	if err != nil {
   473  		return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out)
   474  	}
   475  
   476  	section, err := e2e.GetSection(out, "Deployed")
   477  	if err != nil {
   478  		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
   479  	}
   480  
   481  	rows, err := e2e.ParseColumns(section)
   482  	if err != nil {
   483  		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
   484  	}
   485  
   486  	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
   487  	raw := rows[0]["Progress Deadline"]
   488  	return time.Parse(layout, raw)
   489  }