github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/e2e/nodedrain/nodedrain.go

package nodedrain

import (
	"fmt"
	"os"
	"strings"
	"time"

	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

const ns = ""

type NodeDrainE2ETest struct {
	framework.TC
	jobIDs  []string
	nodeIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "NodeDrain",
		CanRunLocal: true,
		Consul:      true,
		Cases: []framework.TestCase{
			new(NodeDrainE2ETest),
		},
	})
}

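// BeforeAll waits for a leader to be elected and for at least two client
// nodes to be ready, since the migration test needs a second node to
// receive the drained allocation.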
func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
	e2e.WaitForLeader(f.T(), tc.Nomad())
	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
}

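// AfterEach stops and purges any jobs registered by the test, disables
// draining on any nodes the test drained, and runs a system GC. Cleanup is
// skipped when NOMAD_TEST_SKIPCLEANUP=1 is set.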
func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}

	for _, id := range tc.jobIDs {
		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		f.Assert().NoError(err)
	}
	tc.jobIDs = []string{}

	for _, id := range tc.nodeIDs {
		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
		f.Assert().NoError(err)
	}
	tc.nodeIDs = []string{}

	_, err := e2e.Command("nomad", "system", "gc")
	f.Assert().NoError(err)
}

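// nodesForJob returns the IDs of the nodes currently holding allocations for
// the given job, or an error if the job has no allocations.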
func nodesForJob(jobID string) ([]string, error) {
	allocs, err := e2e.AllocsForJob(jobID, ns)
	if err != nil {
		return nil, err
	}
	if len(allocs) < 1 {
		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
	}
	nodes := []string{}
	for _, alloc := range allocs {
		nodes = append(nodes, alloc["Node ID"])
	}
	return nodes, nil
}

// waitForNodeDrain is a convenience wrapper that polls 'node status'
// until the comparison function over the state of the job's allocs on that
// node returns true.
func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
	var got []map[string]string
	var err error
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)
		got, err = e2e.AllocsForNode(nodeID)
		if err != nil {
			return false, err
		}
		return comparison(got), nil
	}, func(e error) {
		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
	})
	return err
}

// TestNodeDrainEphemeralMigrate tests that ephemeral_disk migrations work as
// expected even during a node drain.
func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_migrate.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	allocs, err := e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocs for job")
	f.Len(allocs, 1, "could not get allocs for job")
	oldAllocID := allocs[0]["ID"]

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "could not get nodes for job")
	nodeID := nodes[0]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	// wait for the allocation to be migrated
	expected = []string{"running", "complete"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	allocs, err = e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not get allocations for job")

	// the task writes its alloc ID to a file if it hasn't been previously
	// written, so find the contents of the migrated file and make sure they
	// match the old allocation, not the running one
	var got string
	var fsErr error
	testutil.WaitForResultRetries(500, func() (bool, error) {
		time.Sleep(time.Millisecond * 100)
		for _, alloc := range allocs {
			if alloc["Status"] == "running" && alloc["Node ID"] != nodeID && alloc["ID"] != oldAllocID {
				got, fsErr = e2e.Command("nomad", "alloc", "fs",
					alloc["ID"], fmt.Sprintf("alloc/data/%s", jobID))
				if fsErr != nil {
					return false, fsErr
				}
				if strings.TrimSpace(got) == oldAllocID {
					return true, nil
				}
				return false, fmt.Errorf("expected %q, got %q", oldAllocID, got)
			}
		}
		return false, fmt.Errorf("did not find a migrated alloc")
	}, func(e error) {
		fsErr = e
	})
	f.NoError(fsErr, "node drained but migration failed")
}

// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {

	nodes, err := e2e.NodeStatusListFiltered(
		func(section string) bool {
			kernelName, err := e2e.GetField(section, "kernel.name")
			return err == nil && kernelName == "linux"
		})
	f.NoError(err, "could not get node status listing")

	serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
	systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]

	f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
	tc.jobIDs = append(tc.jobIDs, serviceJobID)

	allocs, err := e2e.AllocsForJob(serviceJobID, ns)
	f.NoError(err, "could not get allocs for service job")
	f.Len(allocs, 1, "could not get allocs for service job")
	oldAllocID := allocs[0]["ID"]

	f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
	tc.jobIDs = append(tc.jobIDs, systemJobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected),
		"service job should be running")

	// can't just give it a static list because the number of nodes can vary
	f.NoError(
		e2e.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) },
			func(got []string) bool {
				if len(got) != len(nodes) {
					return false
				}
				for _, status := range got {
					if status != "running" {
						return false
					}
				}
				return true
			}, nil,
		),
		"system job should be running on every node",
	)

	jobNodes, err := nodesForJob(serviceJobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(jobNodes, 1, "could not get nodes for job")
	nodeID := jobNodes[0]

	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-ignore-system", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			for _, alloc := range got {
				if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
	), "node did not drain")

	allocs, err = e2e.AllocsForJob(systemJobID, ns)
	f.NoError(err, "could not query allocs for system job")
	f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
	for _, alloc := range allocs {
		f.Equal("run", alloc["Desired"], "no system allocs should be draining")
		f.Equal("running", alloc["Status"], "no system allocs should be draining")
	}
}

// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")

	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "could not get nodes for job")
	nodeID := nodes[0]

	f.T().Logf("draining node %v", nodeID)
	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-deadline", "5s",
		"-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	// the drain deadline is 5s, but we can't guarantee the allocation is
	// terminated at exactly that point, so poll for up to 40s, which is still
	// well under the 2m kill_timeout in the job. The extra time accounts for
	// scheduling and propagation delays.
	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			// FIXME: check the drain job alloc specifically. test
			// may pass if client had another completed alloc
			for _, alloc := range got {
				if alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
	), "node did not drain immediately following deadline")
}

// TestNodeDrainForce tests the enforcement of the node drain -force flag
// so that allocations are terminated immediately.
func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")

	jobID := "test-node-drain-" + uuid.Generate()[0:8]
	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
	tc.jobIDs = append(tc.jobIDs, jobID)

	expected := []string{"running"}
	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")

	nodes, err := nodesForJob(jobID)
	f.NoError(err, "could not get nodes for job")
	f.Len(nodes, 1, "could not get nodes for job")
	nodeID := nodes[0]

	out, err := e2e.Command(
		"nomad", "node", "drain",
		"-force",
		"-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	// we've passed -force, but we can't guarantee the allocation is terminated
	// instantly, so poll for up to 40s, which is still well under the 2m
	// kill_timeout in the job
	f.NoError(waitForNodeDrain(nodeID,
		func(got []map[string]string) bool {
			// FIXME: check the drain job alloc specifically. test
			// may pass if client had another completed alloc
			for _, alloc := range got {
				if alloc["Status"] == "complete" {
					return true
				}
			}
			return false
		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
	), "node did not drain immediately when forced")
}

// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
// scheduling after disabling drain.
func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {

	nodes, err := e2e.NodeStatusList()
	f.NoError(err, "could not get node status listing")

	nodeID := nodes[0]["ID"]

	out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
	tc.nodeIDs = append(tc.nodeIDs, nodeID)

	out, err = e2e.Command(
		"nomad", "node", "drain",
		"-disable", "-keep-ineligible", "-yes", nodeID)
	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))

	nodes, err = e2e.NodeStatusList()
	f.NoError(err, "could not get updated node status listing")

	f.Equal("ineligible", nodes[0]["Eligibility"])
	f.Equal("false", nodes[0]["Drain"])
}