github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/nodedrain/nodedrain.go

package nodedrain

import (
	"fmt"
	"os"
	"strings"
	"time"

	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

const ns = ""

type NodeDrainE2ETest struct {
	framework.TC
	jobIDs  []string
	nodeIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "NodeDrain",
		CanRunLocal: true,
		Consul:      true,
		Cases: []framework.TestCase{
			new(NodeDrainE2ETest),
		},
	})
}

func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
	e2e.WaitForLeader(f.T(), tc.Nomad())
	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
}

func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}

	for _, id := range tc.jobIDs {
		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err)
	}
	tc.jobIDs = []string{}

	for _, id := range tc.nodeIDs {
		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
		_, err = e2e.Command("nomad", "node", "eligibility", "-enable", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	_, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err)
}

// nodesForJob returns the IDs of the nodes hosting the job's current
// allocations.
func nodesForJob(jobID string) ([]string, error) {
	allocs, err := e2e.AllocsForJob(jobID, ns)
	if err != nil {
		return nil, err
	}
	if len(allocs) < 1 {
		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
	}
	nodes := []string{}
	for _, alloc := range allocs {
		nodes = append(nodes, alloc["Node ID"])
	}
	return nodes, nil
}

// waitForNodeDrain is a convenience wrapper that polls 'node status'
// until the comparison function over the state of the job's allocs on that
// node returns true.
func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
	var got []map[string]string
	var err error
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		got, err = e2e.AllocsForNode(nodeID)
		if err != nil {
			return false, err
		}
		return comparison(got), nil
	}, func(e error) {
		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
	})
	return err
}
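
// A typical comparison waits for a specific allocation to reach a terminal
// client status, as in the tests below. A minimal usage sketch (illustrative
// only; "oldAllocID" is assumed to have been captured before the drain was
// enabled):
//
//	err := waitForNodeDrain(nodeID,
//		func(got []map[string]string) bool {
//			for _, alloc := range got {
//				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
//					return true
//				}
//			}
//			return false
//		}, &e2e.WaitConfig{Interval: time.Second, Retries: 30})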

// TestNodeDrainEphemeralMigrate tests that ephemeral_disk migrations work as
// expected even during a node drain.
func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	allocs, err := e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocs for job")
	f.Len(allocs, 1, "could not get allocs for job")
	oldAllocID := allocs[0]["ID"]

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "could not get nodes for job")
	nodeID := nodes[0]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	// wait for the allocation to be migrated
	expected = []string{"running", "complete"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	allocs, err = e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocations for job")

	// the task writes its alloc ID to a file only if it hasn't been previously
	// written, so read the contents of the migrated file and make sure they
	// match the old allocation, not the running one
	var got string
	var fsErr error
	testutil.WaitForResultRetries(10, func() (bool, error) {
		time.Sleep(time.Millisecond * 100)
		for _, alloc := range allocs {
			if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
				got, fsErr = e2e.Command("nomad", "alloc", "fs",
					alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
				if fsErr != nil {
					return false, fsErr
				}
				return true, nil
			}
		}
		return false, fmt.Errorf("missing expected allocation")
	}, func(e error) {
		fsErr = e
	})
	f.NoError(fsErr, "could not get allocation data")
	f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
}
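
// The alloc-filesystem check above generalizes to any test that needs to read
// a file out of a (possibly rescheduled) allocation. A minimal sketch of such
// a helper (hypothetical; not part of this suite), assuming 'nomad alloc fs'
// prints the raw file contents:
//
//	func allocFileContents(allocID, path string) (string, error) {
//		out, err := e2e.Command("nomad", "alloc", "fs", allocID, path)
//		if err != nil {
//			return "", fmt.Errorf("could not read %s from alloc %s: %v", path, allocID, err)
//		}
//		return strings.TrimSpace(out), nil
//	}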

// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {
	nodes, err := e2e.NodeStatusListFiltered(
		func(section string) bool {
			kernelName, err := e2e.GetField(section, "kernel.name")
			return err == nil && kernelName == "linux"
		})
	f.NoError(err, "could not get node status listing")

	serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
	systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]

	f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
	tc.jobIDs = append(tc.jobIDs, serviceJobID)

	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, []string{"running"}))

	allocs, err := e2e.AllocsForJob(serviceJobID, ns)
	f.NoError(err, "could not get allocs for service job")
	f.Len(allocs, 1, "could not get allocs for service job")
	oldAllocID := allocs[0]["ID"]

	f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
	tc.jobIDs = append(tc.jobIDs, systemJobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected),
		"service job should be running")

	// can't just compare against a static list because the number of nodes can vary
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) },
			func(got []string) bool {
				if len(got) != len(nodes) {
					return false
				}
				for _, status := range got {
					if status != "running" {
						return false
					}
				}
				return true
			}, nil,
		),
		"system job should be running on every node",
	)

	jobNodes, err := nodesForJob(serviceJobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(jobNodes, 1, "could not get nodes for job")
	nodeID := jobNodes[0]

	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-ignore-system", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	allocs, err = e2e.AllocsForJob(systemJobID, ns)
	f.NoError(err, "could not query allocs for system job")
	f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
	for _, alloc := range allocs {
		f.Equal("run", alloc["Desired"], "no system allocs should be draining")
		f.Equal("running", alloc["Status"], "no system allocs should be draining")
	}
}

// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")
Issue 9902") 250 251 jobID := "test-node-drain-" + uuid.Generate()[0:8] 252 f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad")) 253 tc.jobIDs = append(tc.jobIDs, jobID) 254 255 expected := []string{"running"} 256 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 257 258 nodes, err := nodesForJob(jobID) 259 f.NoError(err, "could not get nodes for job") 260 f.Len(nodes, 1, "could not get nodes for job") 261 nodeID := nodes[0] 262 263 f.T().Logf("draining node %v", nodeID) 264 out, err := e2e.Command( 265 "nomad", "node", "drain", 266 "-deadline", "5s", 267 "-enable", "-yes", "-detach", nodeID) 268 f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out)) 269 tc.nodeIDs = append(tc.nodeIDs, nodeID) 270 271 // the deadline is 40s but we can't guarantee its instantly terminated at 272 // that point, so we give it 30s which is well under the 2m kill_timeout in 273 // the job. 274 // deadline here needs to account for scheduling and propagation delays. 275 f.NoError(waitForNodeDrain(nodeID, 276 func(got []map[string]string) bool { 277 // FIXME: check the drain job alloc specifically. test 278 // may pass if client had another completed alloc 279 for _, alloc := range got { 280 if alloc["Status"] == "complete" { 281 return true 282 } 283 } 284 return false 285 }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, 286 ), "node did not drain immediately following deadline") 287 } 288 289 // TestNodeDrainForce tests the enforcement of the node drain -force flag so 290 // that allocations are terminated immediately. 291 func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) { 292 f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902") 293 294 jobID := "test-node-drain-" + uuid.Generate()[0:8] 295 f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad")) 296 tc.jobIDs = append(tc.jobIDs, jobID) 297 298 expected := []string{"running"} 299 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 300 301 nodes, err := nodesForJob(jobID) 302 f.NoError(err, "could not get nodes for job") 303 f.Len(nodes, 1, "could not get nodes for job") 304 nodeID := nodes[0] 305 306 out, err := e2e.Command( 307 "nomad", "node", "drain", 308 "-force", 309 "-enable", "-yes", "-detach", nodeID) 310 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 311 tc.nodeIDs = append(tc.nodeIDs, nodeID) 312 313 // we've passed -force but we can't guarantee its instantly terminated at 314 // that point, so we give it 30s which is under the 2m kill_timeout in 315 // the job 316 f.NoError(waitForNodeDrain(nodeID, 317 func(got []map[string]string) bool { 318 // FIXME: check the drain job alloc specifically. test 319 // may pass if client had another completed alloc 320 for _, alloc := range got { 321 if alloc["Status"] == "complete" { 322 return true 323 } 324 } 325 return false 326 }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, 327 ), "node did not drain immediately when forced") 328 329 } 330 331 // TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for 332 // scheduling after disabling drain. 

// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
// scheduling after disabling drain.
func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {
	nodes, err := e2e.NodeStatusList()
	f.NoError(err, "could not get node status listing")

	nodeID := nodes[0]["ID"]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	out, err = e2e.Command(
		"nomad", "node", "drain",
		"-disable", "-keep-ineligible", "-yes", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))

	nodes, err = e2e.NodeStatusList()
	f.NoError(err, "could not get updated node status listing")

	f.Equal("ineligible", nodes[0]["Eligibility"])
	f.Equal("false", nodes[0]["Drain"])
}