github.com/bigcommerce/nomad@v0.9.3-bc/e2e/rescheduling/server_side_restarts_test.go

package rescheduling

import (
	"sort"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/jobspec"
	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

var _ = Describe("Server Side Restart Tests", func() {

	var (
		jobs     *api.Jobs
		system   *api.System
		job      *api.Job
		err      error
		specFile string

		// allocStatuses is a helper function that returns the sorted
		// client statuses of the job's current allocations
		allocStatuses = func() []string {
			allocs, _, err := jobs.Allocations(*job.ID, false, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			for _, a := range allocs {
				ret = append(ret, a.ClientStatus)
			}
			sort.Strings(ret)
			return ret
		}

		// allocStatusesRescheduled is a helper function that returns client
		// statuses only for allocs that have been rescheduled or have a follow-up eval
		allocStatusesRescheduled = func() []string {
			allocs, _, err := jobs.Allocations(*job.ID, false, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			for _, a := range allocs {
				if (a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0) || a.FollowupEvalID != "" {
					ret = append(ret, a.ClientStatus)
				}
			}
			return ret
		}

		// deploymentStatus is a helper function that returns the statuses of all
		// of the job's deployments, sorted by create index (i.e. creation order)
		deploymentStatus = func() []string {
			deploys, _, err := jobs.Deployments(*job.ID, false, nil)
			Expect(err).ShouldNot(HaveOccurred())
			var ret []string
			sort.Slice(deploys, func(i, j int) bool {
				return deploys[i].CreateIndex < deploys[j].CreateIndex
			})
			for _, d := range deploys {
				ret = append(ret, d.Status)
			}
			return ret
		}
	)

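	// BeforeSuite builds the API client once for the whole suite. api.DefaultConfig
	// honors NOMAD_ADDR and related environment variables, so these tests run
	// against whatever Nomad agent the environment points at.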
	BeforeSuite(func() {
		conf := api.DefaultConfig()

		// Create client
		client, err := api.NewClient(conf)
		Expect(err).ShouldNot(HaveOccurred())
		jobs = client.Jobs()
		system = client.System()
	})

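	// JustBeforeEach runs after each Context's BeforeEach has set specFile:
	// it parses that jobspec, gives it a fresh UUID as its job ID so specs
	// never collide with one another, and registers it with the cluster.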
	JustBeforeEach(func() {
		job, err = jobspec.ParseFile(specFile)
		Expect(err).ShouldNot(HaveOccurred())
		job.ID = helper.StringToPtr(uuid.Generate())
		resp, _, err := jobs.Register(job, nil)
		Expect(err).ShouldNot(HaveOccurred())
		Expect(resp.EvalID).ShouldNot(BeEmpty())
	})

	AfterEach(func() {
		// Deregister and purge the job, then garbage collect
		jobs.Deregister(*job.ID, true, nil)
		system.GarbageCollect()
	})

	Describe("Reschedule Stanza Tests", func() {

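		// Each Context below points specFile at a jobspec under ./input/ and
		// relies on the JustBeforeEach above to register it; the assertions then
		// poll alloc client statuses (and deployment statuses) with Eventually.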
		Context("No reschedule attempts", func() {
			BeforeEach(func() {
				specFile = "input/norescheduling.hcl"
			})

			It("Should have exactly three allocs and all failed", func() {
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
			})
		})

		Context("System jobs should never be rescheduled", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_system.hcl"
			})

			It("Should have exactly one failed alloc", func() {
				Eventually(allocStatuses, 10*time.Second, time.Second).Should(ConsistOf([]string{"failed"}))
			})
		})

		Context("Default Rescheduling", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_default.hcl"
			})
			It("Should have exactly three allocs and all failed after 5 secs", func() {
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"}))
			})
			// wait until the first exponential delay kicks in and rescheduling is attempted
			It("Should have exactly six allocs and all failed after 35 secs", func() {
				if !*slow {
					Skip("Skipping slow test")
				}
				Eventually(allocStatuses, 35*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed", "failed", "failed", "failed"}))
			})
		})

		Context("Reschedule attempts maxed out", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_fail.hcl"
			})
			It("Should have all failed", func() {
				Eventually(allocStatuses, 6*time.Second, time.Second).ShouldNot(
					SatisfyAll(ContainElement("pending"),
						ContainElement("running")))
			})
			Context("Updating job to change its version", func() {
				It("Should have running allocs now", func() {
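					// Swap in a long-running command so the new job version's allocs stay
					// healthy (this assumes the task in the jobspec invokes a shell, hence "-c")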
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatuses, 5*time.Second, time.Second).Should(ContainElement("running"))
				})
			})
		})

		Context("Reschedule attempts succeeded", func() {
			BeforeEach(func() {
				specFile = "input/reschedule_success.hcl"
			})
			It("Should have some running allocs", func() {
				Eventually(allocStatuses, 6*time.Second, time.Second).Should(
					ContainElement("running"))
			})
		})

		Context("Reschedule with update stanza", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_update.hcl"
			})
			It("Should have all running allocs", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))
			})
			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
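					// "lol" is not a valid command, so allocs for the new job version fail
					// and the scheduler keeps rescheduling them until the progress deadline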
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())
				})
			})

		})

		Context("Reschedule with canary", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_canary.hcl"
			})
			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
				Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})

			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty())

					// Verify the new deployment and its status.
					// Deployment status should be running (because of the progress deadline).
					time.Sleep(3 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
					Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
						ContainElement(structs.DeploymentStatusRunning))
				})
			})

		})

		Context("Reschedule with canary, auto revert with short progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_canary_autorevert.hcl"
			})
			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second)
				Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))

				// Make an update that causes the job to fail
				job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
				_, _, err := jobs.Register(job, nil)
				Expect(err).ShouldNot(HaveOccurred())
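				// With a short progress deadline and auto_revert, the deployment is expected
				// to fail and revert before server-side rescheduling kicks in, so no alloc
				// should carry reschedule tracking (an assumption based on this context's jobspec)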
				Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

				// Wait for the revert
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"failed", "failed", "failed", "running", "running", "running"}))

				// Verify the new deployments and their statuses.
				// There should be one successful, one failed, and one more successful deployment (after the revert).
				time.Sleep(5 * time.Second) // TODO(preetha) figure out why this wasn't working with ginkgo constructs
				Eventually(deploymentStatus, 5*time.Second, time.Second).Should(
					ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
			})

		})

		Context("Reschedule with max parallel/auto_revert false", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_maxp.hcl"
			})
			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(2 * time.Second)
				Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})

			Context("Updating job to make allocs fail", func() {
				It("Should have rescheduled allocs until progress deadline", func() {
					job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
					_, _, err := jobs.Register(job, nil)
					Expect(err).ShouldNot(HaveOccurred())
					Eventually(allocStatusesRescheduled, 6*time.Second, time.Second).ShouldNot(BeEmpty())

					// Should have failed allocs including rescheduled failed allocs
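					// ("complete" here is presumably the old-version alloc stopped as the
					// rolling update began; the rest are failed new-version allocs plus
					// still-running old ones)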
					Eventually(allocStatuses, 6*time.Second, time.Second).Should(
						ConsistOf([]string{"complete", "failed", "failed", "running", "running"}))

					// Verify the new deployment and its status
					Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
						ContainElement(structs.DeploymentStatusRunning))
				})
			})

		})

		Context("Reschedule with max parallel, auto revert true and short progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_maxp_autorevert.hcl"
			})
			It("Should have running allocs and successful deployment", func() {
				Eventually(allocStatuses, 3*time.Second, time.Second).Should(
					ConsistOf([]string{"running", "running", "running"}))

				time.Sleep(4 * time.Second)
				Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))

				// Make an update that causes the job to fail
				job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
				_, _, err := jobs.Register(job, nil)
				Expect(err).ShouldNot(HaveOccurred())
				Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

				// Wait for the revert
				Eventually(allocStatuses, 5*time.Second, time.Second).Should(
					ConsistOf([]string{"complete", "failed", "running", "running", "running"}))

				// Verify the new deployments and their statuses.
				// There should be one successful, one failed, and one more successful deployment (after the revert).
				time.Sleep(5 * time.Second)
				Eventually(deploymentStatus, 2*time.Second, time.Second).Should(
					ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
			})

		})

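		// This spec needs tens of seconds for the deployment to reach a terminal
		// status, so it only runs when the package-level slow flag is set.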
		Context("Reschedule with progress deadline", func() {
			BeforeEach(func() {
				specFile = "input/rescheduling_progressdeadline.hcl"
			})
			It("Should have running allocs and successful deployment", func() {
				if !*slow {
					Skip("Skipping slow test")
				}
				// Deployment should succeed eventually
				time.Sleep(20 * time.Second)
				Eventually(deploymentStatus, 5*time.Second, time.Second).Should(
					ContainElement(structs.DeploymentStatusSuccessful))
			})

		})

	})

})