github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/e2e/nodedrain/nodedrain.go (about) 1 package nodedrain 2 3 import ( 4 "fmt" 5 "os" 6 "strings" 7 "time" 8 9 e2e "github.com/hashicorp/nomad/e2e/e2eutil" 10 "github.com/hashicorp/nomad/e2e/framework" 11 "github.com/hashicorp/nomad/helper/uuid" 12 "github.com/hashicorp/nomad/testutil" 13 ) 14 15 const ns = "" 16 17 type NodeDrainE2ETest struct { 18 framework.TC 19 jobIDs []string 20 nodeIDs []string 21 } 22 23 func init() { 24 framework.AddSuites(&framework.TestSuite{ 25 Component: "NodeDrain", 26 CanRunLocal: true, 27 Consul: true, 28 Cases: []framework.TestCase{ 29 new(NodeDrainE2ETest), 30 }, 31 }) 32 33 } 34 35 func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) { 36 e2e.WaitForLeader(f.T(), tc.Nomad()) 37 e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration 38 } 39 40 func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) { 41 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 42 return 43 } 44 45 for _, id := range tc.jobIDs { 46 _, err := e2e.Command("nomad", "job", "stop", "-purge", id) 47 f.Assert().NoError(err) 48 } 49 tc.jobIDs = []string{} 50 51 for _, id := range tc.nodeIDs { 52 _, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id) 53 f.Assert().NoError(err) 54 } 55 tc.nodeIDs = []string{} 56 57 _, err := e2e.Command("nomad", "system", "gc") 58 f.Assert().NoError(err) 59 } 60 61 func nodesForJob(jobID string) ([]string, error) { 62 allocs, err := e2e.AllocsForJob(jobID, ns) 63 if err != nil { 64 return nil, err 65 } 66 if len(allocs) < 1 { 67 return nil, fmt.Errorf("no allocs found for job: %v", jobID) 68 } 69 nodes := []string{} 70 for _, alloc := range allocs { 71 nodes = append(nodes, alloc["Node ID"]) 72 } 73 return nodes, nil 74 } 75 76 // waitForNodeDrain is a convenience wrapper that polls 'node status' 77 // until the comparison function over the state of the job's allocs on that 78 // node returns true 79 func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error { 80 var got []map[string]string 81 var err error 82 interval, retries := wc.OrDefault() 83 testutil.WaitForResultRetries(retries, func() (bool, error) { 84 time.Sleep(interval) 85 got, err = e2e.AllocsForNode(nodeID) 86 if err != nil { 87 return false, err 88 } 89 return comparison(got), nil 90 }, func(e error) { 91 err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got) 92 }) 93 return err 94 } 95 96 // TestNodeDrainEphemeralMigrate tests that ephermeral_disk migrations work as 97 // expected even during a node drain. 98 func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) { 99 jobID := "test-node-drain-" + uuid.Generate()[0:8] 100 f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad")) 101 tc.jobIDs = append(tc.jobIDs, jobID) 102 103 expected := []string{"running"} 104 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 105 106 allocs, err := e2e.AllocsForJob(jobID, ns) 107 f.NoError(err, "could not get allocs for job") 108 f.Len(allocs, 1, "could not get allocs for job") 109 oldAllocID := allocs[0]["ID"] 110 111 nodes, err := nodesForJob(jobID) 112 f.NoError(err, "could not get nodes for job") 113 f.Len(nodes, 1, "could not get nodes for job") 114 nodeID := nodes[0] 115 116 out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID) 117 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 118 tc.nodeIDs = append(tc.nodeIDs, nodeID) 119 120 f.NoError(waitForNodeDrain(nodeID, 121 func(got []map[string]string) bool { 122 for _, alloc := range got { 123 if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" { 124 return true 125 } 126 } 127 return false 128 }, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500}, 129 ), "node did not drain") 130 131 // wait for the allocation to be migrated 132 expected = []string{"running", "complete"} 133 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 134 135 allocs, err = e2e.AllocsForJob(jobID, ns) 136 f.NoError(err, "could not get allocations for job") 137 138 // the task writes its alloc ID to a file if it hasn't been previously 139 // written, so find the contents of the migrated file and make sure they 140 // match the old allocation, not the running one 141 var got string 142 var fsErr error 143 testutil.WaitForResultRetries(500, func() (bool, error) { 144 time.Sleep(time.Millisecond * 100) 145 for _, alloc := range allocs { 146 if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID { 147 got, fsErr = e2e.Command("nomad", "alloc", "fs", 148 alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID)) 149 if err != nil { 150 return false, err 151 } 152 if strings.TrimSpace(got) == oldAllocID { 153 return true, nil 154 } else { 155 return false, fmt.Errorf("expected %q, got %q", oldAllocID, got) 156 } 157 } 158 } 159 return false, fmt.Errorf("did not find a migrated alloc") 160 }, func(e error) { 161 fsErr = e 162 }) 163 f.NoError(fsErr, "node drained but migration failed") 164 } 165 166 // TestNodeDrainIgnoreSystem tests that system jobs are left behind when the 167 // -ignore-system flag is used. 168 func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) { 169 170 nodes, err := e2e.NodeStatusListFiltered( 171 func(section string) bool { 172 kernelName, err := e2e.GetField(section, "kernel.name") 173 return err == nil && kernelName == "linux" 174 }) 175 f.NoError(err, "could not get node status listing") 176 177 serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8] 178 systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8] 179 180 f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad")) 181 tc.jobIDs = append(tc.jobIDs, serviceJobID) 182 183 allocs, err := e2e.AllocsForJob(serviceJobID, ns) 184 f.NoError(err, "could not get allocs for service job") 185 f.Len(allocs, 1, "could not get allocs for service job") 186 oldAllocID := allocs[0]["ID"] 187 188 f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad")) 189 tc.jobIDs = append(tc.jobIDs, systemJobID) 190 191 expected := []string{"running"} 192 f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected), 193 "service job should be running") 194 195 // can't just give it a static list because the number of nodes can vary 196 f.NoError( 197 e2e.WaitForAllocStatusComparison( 198 func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) }, 199 func(got []string) bool { 200 if len(got) != len(nodes) { 201 return false 202 } 203 for _, status := range got { 204 if status != "running" { 205 return false 206 } 207 } 208 return true 209 }, nil, 210 ), 211 "system job should be running on every node", 212 ) 213 214 jobNodes, err := nodesForJob(serviceJobID) 215 f.NoError(err, "could not get nodes for job") 216 f.Len(jobNodes, 1, "could not get nodes for job") 217 nodeID := jobNodes[0] 218 219 out, err := e2e.Command( 220 "nomad", "node", "drain", 221 "-ignore-system", "-enable", "-yes", "-detach", nodeID) 222 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 223 tc.nodeIDs = append(tc.nodeIDs, nodeID) 224 225 f.NoError(waitForNodeDrain(nodeID, 226 func(got []map[string]string) bool { 227 for _, alloc := range got { 228 if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" { 229 return true 230 } 231 } 232 return false 233 }, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500}, 234 ), "node did not drain") 235 236 allocs, err = e2e.AllocsForJob(systemJobID, ns) 237 f.NoError(err, "could not query allocs for system job") 238 f.Equal(len(nodes), len(allocs), "system job should still be running on every node") 239 for _, alloc := range allocs { 240 f.Equal("run", alloc["Desired"], "no system allocs should be draining") 241 f.Equal("running", alloc["Status"], "no system allocs should be draining") 242 } 243 } 244 245 // TestNodeDrainDeadline tests the enforcement of the node drain deadline so 246 // that allocations are terminated even if they haven't gracefully exited. 247 func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) { 248 f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902") 249 250 jobID := "test-node-drain-" + uuid.Generate()[0:8] 251 f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad")) 252 tc.jobIDs = append(tc.jobIDs, jobID) 253 254 expected := []string{"running"} 255 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 256 257 nodes, err := nodesForJob(jobID) 258 f.NoError(err, "could not get nodes for job") 259 f.Len(nodes, 1, "could not get nodes for job") 260 nodeID := nodes[0] 261 262 f.T().Logf("draining node %v", nodeID) 263 out, err := e2e.Command( 264 "nomad", "node", "drain", 265 "-deadline", "5s", 266 "-enable", "-yes", "-detach", nodeID) 267 f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out)) 268 tc.nodeIDs = append(tc.nodeIDs, nodeID) 269 270 // the deadline is 40s but we can't guarantee its instantly terminated at 271 // that point, so we give it 30s which is well under the 2m kill_timeout in 272 // the job. 273 // deadline here needs to account for scheduling and propagation delays. 274 f.NoError(waitForNodeDrain(nodeID, 275 func(got []map[string]string) bool { 276 // FIXME: check the drain job alloc specifically. test 277 // may pass if client had another completed alloc 278 for _, alloc := range got { 279 if alloc["Status"] == "complete" { 280 return true 281 } 282 } 283 return false 284 }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, 285 ), "node did not drain immediately following deadline") 286 } 287 288 // TestNodeDrainDeadline tests the enforcement of the node drain -force flag 289 // so that allocations are terminated immediately. 290 func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) { 291 f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902") 292 293 jobID := "test-node-drain-" + uuid.Generate()[0:8] 294 f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad")) 295 tc.jobIDs = append(tc.jobIDs, jobID) 296 297 expected := []string{"running"} 298 f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running") 299 300 nodes, err := nodesForJob(jobID) 301 f.NoError(err, "could not get nodes for job") 302 f.Len(nodes, 1, "could not get nodes for job") 303 nodeID := nodes[0] 304 305 out, err := e2e.Command( 306 "nomad", "node", "drain", 307 "-force", 308 "-enable", "-yes", "-detach", nodeID) 309 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 310 tc.nodeIDs = append(tc.nodeIDs, nodeID) 311 312 // we've passed -force but we can't guarantee its instantly terminated at 313 // that point, so we give it 30s which is under the 2m kill_timeout in 314 // the job 315 f.NoError(waitForNodeDrain(nodeID, 316 func(got []map[string]string) bool { 317 // FIXME: check the drain job alloc specifically. test 318 // may pass if client had another completed alloc 319 for _, alloc := range got { 320 if alloc["Status"] == "complete" { 321 return true 322 } 323 } 324 return false 325 }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, 326 ), "node did not drain immediately when forced") 327 328 } 329 330 // TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for 331 // scheduling after disabling drain. 332 func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) { 333 334 nodes, err := e2e.NodeStatusList() 335 f.NoError(err, "could not get node status listing") 336 337 nodeID := nodes[0]["ID"] 338 339 out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID) 340 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 341 tc.nodeIDs = append(tc.nodeIDs, nodeID) 342 343 _, err = e2e.Command( 344 "nomad", "node", "drain", 345 "-disable", "-keep-ineligible", "-yes", nodeID) 346 f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) 347 348 nodes, err = e2e.NodeStatusList() 349 f.NoError(err, "could not get updated node status listing") 350 351 f.Equal("ineligible", nodes[0]["Eligibility"]) 352 f.Equal("false", nodes[0]["Drain"]) 353 }