github.com/bigcommerce/nomad@v0.9.3-bc/e2e/rescheduling/server_side_restarts_test.go (about) 1 package rescheduling 2 3 import ( 4 "sort" 5 "time" 6 7 "github.com/hashicorp/nomad/api" 8 "github.com/hashicorp/nomad/jobspec" 9 . "github.com/onsi/ginkgo" 10 . "github.com/onsi/gomega" 11 12 "github.com/hashicorp/nomad/helper" 13 "github.com/hashicorp/nomad/helper/uuid" 14 "github.com/hashicorp/nomad/nomad/structs" 15 ) 16 17 var _ = Describe("Server Side Restart Tests", func() { 18 19 var ( 20 jobs *api.Jobs 21 system *api.System 22 job *api.Job 23 err error 24 specFile string 25 26 // allocStatuses is a helper function that pulls 27 // out client statuses from a slice of allocs 28 allocStatuses = func() []string { 29 allocs, _, err := jobs.Allocations(*job.ID, false, nil) 30 Expect(err).ShouldNot(HaveOccurred()) 31 var ret []string 32 for _, a := range allocs { 33 ret = append(ret, a.ClientStatus) 34 } 35 sort.Strings(ret) 36 return ret 37 } 38 39 // allocStatusesRescheduled is a helper function that pulls 40 // out client statuses only from rescheduled allocs 41 allocStatusesRescheduled = func() []string { 42 allocs, _, err := jobs.Allocations(*job.ID, false, nil) 43 Expect(err).ShouldNot(HaveOccurred()) 44 var ret []string 45 for _, a := range allocs { 46 if (a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0) || a.FollowupEvalID != "" { 47 ret = append(ret, a.ClientStatus) 48 } 49 } 50 return ret 51 } 52 53 // deploymentStatus is a helper function that returns deployment status of all deployments 54 // sorted by time 55 deploymentStatus = func() []string { 56 deploys, _, err := jobs.Deployments(*job.ID, false, nil) 57 Expect(err).ShouldNot(HaveOccurred()) 58 var ret []string 59 sort.Slice(deploys, func(i, j int) bool { 60 return deploys[i].CreateIndex < deploys[j].CreateIndex 61 }) 62 for _, d := range deploys { 63 ret = append(ret, d.Status) 64 } 65 return ret 66 } 67 ) 68 69 BeforeSuite(func() { 70 conf := api.DefaultConfig() 71 72 // Create client 73 client, err := api.NewClient(conf) 74 Expect(err).ShouldNot(HaveOccurred()) 75 jobs = client.Jobs() 76 system = client.System() 77 }) 78 79 JustBeforeEach(func() { 80 job, err = jobspec.ParseFile(specFile) 81 Expect(err).ShouldNot(HaveOccurred()) 82 job.ID = helper.StringToPtr(uuid.Generate()) 83 resp, _, err := jobs.Register(job, nil) 84 Expect(err).ShouldNot(HaveOccurred()) 85 Expect(resp.EvalID).ShouldNot(BeEmpty()) 86 87 }) 88 89 AfterEach(func() { 90 //Deregister job 91 jobs.Deregister(*job.ID, true, nil) 92 system.GarbageCollect() 93 }) 94 95 Describe("Reschedule Stanza Tests", func() { 96 97 Context("No reschedule attempts", func() { 98 BeforeEach(func() { 99 specFile = "input/norescheduling.hcl" 100 }) 101 102 It("Should have exactly three allocs and all failed", func() { 103 Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"})) 104 }) 105 }) 106 107 Context("System jobs should never be rescheduled", func() { 108 BeforeEach(func() { 109 specFile = "input/rescheduling_system.hcl" 110 }) 111 112 It("Should have exactly one failed alloc", func() { 113 Eventually(allocStatuses, 10*time.Second, time.Second).Should(ConsistOf([]string{"failed"})) 114 }) 115 }) 116 117 Context("Default Rescheduling", func() { 118 BeforeEach(func() { 119 specFile = "input/rescheduling_default.hcl" 120 }) 121 It("Should have exactly three allocs and all failed after 5 secs", func() { 122 Eventually(allocStatuses, 5*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed"})) 123 }) 124 // wait until first exponential delay kicks in and rescheduling is attempted 125 It("Should have exactly six allocs and all failed after 35 secs", func() { 126 if !*slow { 127 Skip("Skipping slow test") 128 } 129 Eventually(allocStatuses, 35*time.Second, time.Second).Should(ConsistOf([]string{"failed", "failed", "failed", "failed", "failed", "failed"})) 130 }) 131 }) 132 133 Context("Reschedule attempts maxed out", func() { 134 BeforeEach(func() { 135 specFile = "input/rescheduling_fail.hcl" 136 }) 137 It("Should have all failed", func() { 138 Eventually(allocStatuses, 6*time.Second, time.Second).ShouldNot( 139 SatisfyAll(ContainElement("pending"), 140 ContainElement("running"))) 141 }) 142 Context("Updating job to change its version", func() { 143 It("Should have running allocs now", func() { 144 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} 145 _, _, err := jobs.Register(job, nil) 146 Expect(err).ShouldNot(HaveOccurred()) 147 Eventually(allocStatuses, 5*time.Second, time.Second).Should(ContainElement("running")) 148 }) 149 }) 150 }) 151 152 Context("Reschedule attempts succeeded", func() { 153 BeforeEach(func() { 154 specFile = "input/reschedule_success.hcl" 155 }) 156 It("Should have some running allocs", func() { 157 Eventually(allocStatuses, 6*time.Second, time.Second).Should( 158 ContainElement("running")) 159 }) 160 }) 161 162 Context("Reschedule with update stanza", func() { 163 BeforeEach(func() { 164 specFile = "input/rescheduling_update.hcl" 165 }) 166 It("Should have all running allocs", func() { 167 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 168 ConsistOf([]string{"running", "running", "running"})) 169 }) 170 Context("Updating job to make allocs fail", func() { 171 It("Should have rescheduled allocs until progress deadline", func() { 172 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 173 _, _, err := jobs.Register(job, nil) 174 Expect(err).ShouldNot(HaveOccurred()) 175 Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty()) 176 }) 177 }) 178 179 }) 180 181 Context("Reschedule with canary", func() { 182 BeforeEach(func() { 183 specFile = "input/rescheduling_canary.hcl" 184 }) 185 It("Should have running allocs and successful deployment", func() { 186 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 187 ConsistOf([]string{"running", "running", "running"})) 188 189 time.Sleep(2 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs 190 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 191 ContainElement(structs.DeploymentStatusSuccessful)) 192 }) 193 194 Context("Updating job to make allocs fail", func() { 195 It("Should have rescheduled allocs until progress deadline", func() { 196 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 197 _, _, err := jobs.Register(job, nil) 198 Expect(err).ShouldNot(HaveOccurred()) 199 Eventually(allocStatusesRescheduled, 5*time.Second, time.Second).ShouldNot(BeEmpty()) 200 201 // Verify new deployment and its status 202 // Deployment status should be running (because of progress deadline) 203 time.Sleep(3 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs 204 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 205 ContainElement(structs.DeploymentStatusRunning)) 206 }) 207 }) 208 209 }) 210 211 Context("Reschedule with canary, auto revert with short progress deadline ", func() { 212 BeforeEach(func() { 213 specFile = "input/rescheduling_canary_autorevert.hcl" 214 }) 215 It("Should have running allocs and successful deployment", func() { 216 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 217 ConsistOf([]string{"running", "running", "running"})) 218 219 time.Sleep(2 * time.Second) 220 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 221 ContainElement(structs.DeploymentStatusSuccessful)) 222 223 // Make an update that causes the job to fail 224 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 225 _, _, err := jobs.Register(job, nil) 226 Expect(err).ShouldNot(HaveOccurred()) 227 Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty()) 228 229 // Wait for the revert 230 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 231 ConsistOf([]string{"failed", "failed", "failed", "running", "running", "running"})) 232 // Verify new deployment and its status 233 // There should be one successful, one failed, and one more successful (after revert) 234 time.Sleep(5 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs 235 Eventually(deploymentStatus(), 5*time.Second, time.Second).Should( 236 ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful)) 237 }) 238 239 }) 240 241 Context("Reschedule with max parallel/auto_revert false", func() { 242 BeforeEach(func() { 243 specFile = "input/rescheduling_maxp.hcl" 244 }) 245 It("Should have running allocs and successful deployment", func() { 246 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 247 ConsistOf([]string{"running", "running", "running"})) 248 249 time.Sleep(2 * time.Second) 250 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 251 ContainElement(structs.DeploymentStatusSuccessful)) 252 }) 253 254 Context("Updating job to make allocs fail", func() { 255 It("Should have rescheduled allocs till progress deadline", func() { 256 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 257 _, _, err := jobs.Register(job, nil) 258 Expect(err).ShouldNot(HaveOccurred()) 259 Eventually(allocStatusesRescheduled, 6*time.Second, time.Second).ShouldNot(BeEmpty()) 260 261 // Should have failed allocs including rescheduled failed allocs 262 Eventually(allocStatuses, 6*time.Second, time.Second).Should( 263 ConsistOf([]string{"complete", "failed", "failed", "running", "running"})) 264 265 // Verify new deployment and its status 266 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 267 ContainElement(structs.DeploymentStatusRunning)) 268 }) 269 }) 270 271 }) 272 273 Context("Reschedule with max parallel, auto revert true and short progress deadline", func() { 274 BeforeEach(func() { 275 specFile = "input/rescheduling_maxp_autorevert.hcl" 276 }) 277 It("Should have running allocs and successful deployment", func() { 278 Eventually(allocStatuses, 3*time.Second, time.Second).Should( 279 ConsistOf([]string{"running", "running", "running"})) 280 281 time.Sleep(4 * time.Second) 282 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 283 ContainElement(structs.DeploymentStatusSuccessful)) 284 285 // Make an update that causes the job to fail 286 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 287 _, _, err := jobs.Register(job, nil) 288 Expect(err).ShouldNot(HaveOccurred()) 289 Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty()) 290 291 // Wait for the revert 292 Eventually(allocStatuses, 5*time.Second, time.Second).Should( 293 ConsistOf([]string{"complete", "failed", "running", "running", "running"})) 294 295 // Verify new deployment and its status 296 // There should be one successful, one failed, and one more successful (after revert) 297 time.Sleep(5 * time.Second) 298 Eventually(deploymentStatus(), 2*time.Second, time.Second).Should( 299 ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful)) 300 }) 301 302 }) 303 304 Context("Reschedule with progress deadline", func() { 305 BeforeEach(func() { 306 specFile = "input/rescheduling_progressdeadline.hcl" 307 }) 308 It("Should have running allocs and successful deployment", func() { 309 if !*slow { 310 Skip("Skipping slow test") 311 } 312 // Deployment should succeed eventually 313 time.Sleep(20 * time.Second) 314 Eventually(deploymentStatus(), 5*time.Second, time.Second).Should( 315 ContainElement(structs.DeploymentStatusSuccessful)) 316 317 }) 318 319 }) 320 321 }) 322 323 })