github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/nodedrain/nodedrain.go

package nodedrain

import (
	"fmt"
	"os"
	"strings"
	"time"

	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

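// ns is the namespace passed to the e2eutil helpers; the empty string
// leaves the CLI's default namespace in effect.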
const ns = ""

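// NodeDrainE2ETest exercises node drain behavior end-to-end. jobIDs and
// nodeIDs record the jobs registered and nodes drained by each test case
// so that AfterEach can clean them up.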
type NodeDrainE2ETest struct {
	framework.TC
	jobIDs  []string
	nodeIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "NodeDrain",
		CanRunLocal: true,
		Consul:      true,
		Cases: []framework.TestCase{
			new(NodeDrainE2ETest),
		},
	})
}

func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
	e2e.WaitForLeader(f.T(), tc.Nomad())
	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
}

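// AfterEach stops and purges any jobs registered by the test case, disables
// drain and restores scheduling eligibility on any nodes it drained, and
// runs a system GC. Cleanup is skipped when NOMAD_TEST_SKIPCLEANUP=1 is set.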
func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}

	for _, id := range tc.jobIDs {
		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err)
	}
	tc.jobIDs = []string{}

	for _, id := range tc.nodeIDs {
		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
		_, err = e2e.Command("nomad", "node", "eligibility", "-enable", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	_, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err)
}

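// nodesForJob returns the IDs of the nodes hosting allocations for the
// given job, or an error if the job has no allocations.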
func nodesForJob(jobID string) ([]string, error) {
	allocs, err := e2e.AllocsForJob(jobID, ns)
	if err != nil {
		return nil, err
	}
	if len(allocs) < 1 {
		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
	}
	nodes := []string{}
	for _, alloc := range allocs {
		nodes = append(nodes, alloc["Node ID"])
	}
	return nodes, nil
}

// waitForNodeDrain is a convenience wrapper that polls 'node status'
// until the comparison function over the state of the job's allocs on that
// node returns true.
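//
// A typical call (a sketch mirroring the tests below) waits for a specific
// allocation on the node to reach the "complete" status:
//
//	err := waitForNodeDrain(nodeID,
//		func(got []map[string]string) bool {
//			for _, alloc := range got {
//				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
//					return true
//				}
//			}
//			return false
//		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500})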
func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
	var got []map[string]string
	var err error
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		got, err = e2e.AllocsForNode(nodeID)
		if err != nil {
			return false, err
		}
		return comparison(got), nil
	}, func(e error) {
		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
	})
	return err
}

// TestNodeDrainEphemeralMigrate tests that ephemeral_disk migrations work as
// expected even during a node drain.
func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	allocs, err := e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocs for job")
	f.Len(allocs, 1, "expected one alloc for job")
	oldAllocID := allocs[0]["ID"]

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "expected one node for job")
	nodeID := nodes[0]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	// wait for the allocation to be migrated
	expected = []string{"running", "complete"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running after migration")

	allocs, err = e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocations for job")

	// the task writes its alloc ID to a file if it hasn't been previously
	// written, so find the contents of the migrated file and make sure they
	// match the old allocation, not the running one
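	// (the "nomad alloc fs" call below reads alloc/data/<jobID> out of the
	// new allocation's shared alloc/ directory, which the ephemeral_disk
	// migration should have carried over from the drained node)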
	var got string
	var fsErr error
	testutil.WaitForResultRetries(10, func() (bool, error) {
		time.Sleep(time.Millisecond * 100)
		for _, alloc := range allocs {
			if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
				got, fsErr = e2e.Command("nomad", "alloc", "fs",
					alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
				if fsErr != nil {
					return false, fsErr
				}
				return true, nil
			}
		}
		return false, fmt.Errorf("missing expected allocation")
	}, func(e error) {
		fsErr = e
	})
	f.NoError(fsErr, "could not get allocation data")
	f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
}

// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {
	nodes, err := e2e.NodeStatusListFiltered(
		func(section string) bool {
			kernelName, err := e2e.GetField(section, "kernel.name")
			return err == nil && kernelName == "linux"
		})
	f.NoError(err, "could not get node status listing")

	serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
	systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]

	f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
	tc.jobIDs = append(tc.jobIDs, serviceJobID)

	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, []string{"running"}))

	allocs, err := e2e.AllocsForJob(serviceJobID, ns)
	f.NoError(err, "could not get allocs for service job")
	f.Len(allocs, 1, "expected one alloc for service job")
	oldAllocID := allocs[0]["ID"]

	f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
	tc.jobIDs = append(tc.jobIDs, systemJobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected),
		"service job should be running")

	// can't just assert against a static list of statuses because the
	// number of nodes can vary
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) },
			func(got []string) bool {
				if len(got) != len(nodes) {
					return false
				}
				for _, status := range got {
					if status != "running" {
						return false
					}
				}
				return true
			}, nil,
		),
		"system job should be running on every node",
	)

	jobNodes, err := nodesForJob(serviceJobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(jobNodes, 1, "expected one node for service job")
	nodeID := jobNodes[0]

	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-ignore-system", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	allocs, err = e2e.AllocsForJob(systemJobID, ns)
	f.NoError(err, "could not query allocs for system job")
	f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
	for _, alloc := range allocs {
		f.Equal("run", alloc["Desired"], "no system allocs should be draining")
		f.Equal("running", alloc["Status"], "no system allocs should be draining")
	}
}

// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")

	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "expected one node for job")
	nodeID := nodes[0]

	f.T().Logf("draining node %v", nodeID)
	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-deadline", "5s",
		"-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)
	// the drain deadline is 5s, but we can't guarantee the allocation is
	// terminated the instant it expires, so we poll for up to 40s, which is
	// still well under the 2m kill_timeout in the job. the window also needs
	// to account for scheduling and propagation delays.
	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			// FIXME: check the drain job alloc specifically. test
			// may pass if client had another completed alloc
			for _, alloc := range got {
				if alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
	), "node did not drain immediately following deadline")
}

// TestNodeDrainForce tests the enforcement of the node drain -force flag so
// that allocations are terminated immediately.
func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")

	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "expected one node for job")
	nodeID := nodes[0]

	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-force",
		"-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)
	// we've passed -force, but we can't guarantee the allocation is
	// terminated instantly, so we poll for up to 40s, which is still under
	// the 2m kill_timeout in the job
	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			// FIXME: check the drain job alloc specifically. test
			// may pass if client had another completed alloc
			for _, alloc := range got {
				if alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
	), "node did not drain immediately when forced")
}

// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
// scheduling after disabling drain.
func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {
	nodes, err := e2e.NodeStatusList()
	f.NoError(err, "could not get node status listing")

	nodeID := nodes[0]["ID"]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	out, err = e2e.Command(
		"nomad", "node", "drain",
		"-disable", "-keep-ineligible", "-yes", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))

	nodes, err = e2e.NodeStatusList()
	f.NoError(err, "could not get updated node status listing")

	f.Equal("ineligible", nodes[0]["Eligibility"])
	f.Equal("false", nodes[0]["Drain"])
}