github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/nomad/drainer_int_test.go

package nomad

import (
	"context"
	"fmt"
	"net/rpc"
	"testing"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

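// allocPromoter watches the allocations on the given node and, for any
// service alloc that does not yet have a deployment health status, marks it
// healthy. It mimics what a client would report so that drains and
// deployments can make progress in these tests.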
func allocPromoter(errCh chan<- error, ctx context.Context,
	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
	logger log.Logger) {

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			if alloc.DeploymentStatus.HasHealth() {
				continue
			}
			newAlloc := alloc.Copy()
			newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy:   helper.BoolToPtr(true),
				Timestamp: now,
			}
			updates = append(updates, newAlloc)
			logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID)
		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			}
			errCh <- err
		}
	}
}

// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}

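// getNodeAllocs runs a blocking query for the allocations on the given node,
// returning the allocations and the index at which they were retrieved.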
func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}

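// getNodeAllocsImpl returns the blocking query function used by getNodeAllocs:
// it captures a node's allocations along with the current index of the allocs
// table.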
func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := state.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}

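// TestDrainer_Simple_ServiceOnly drains a node that only runs service allocs
// and verifies they are replaced on a second node and the drain strategy is
// then cleared.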
func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
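	// Promote deployment health on both nodes; the drainer only continues
	// migrating service allocs once their replacements are marked healthy.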
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

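// TestDrainer_Simple_ServiceOnly_Deadline verifies that when the drain
// deadline is reached the remaining service allocs are stopped and the drain
// completes with a deadline-reached event.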
func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

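// TestDrainer_DrainEmptyNode verifies that draining a node with no
// allocations completes right away and clears the drain strategy.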
func TestDrainer_DrainEmptyNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Check that the node drain is removed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

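// TestDrainer_AllTypes_Deadline drains a node running service, system, and
// batch allocs with a short deadline and verifies that the service allocs are
// stopped before the batch allocs, everything is stopped by the deadline, and
// the work is placed on a second node.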
func TestDrainer_AllTypes_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that the service allocs were stopped before the batch allocs
	var serviceMax, batchMax uint64 = 0, 0
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	require.True(serviceMax < batchMax)

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that drain is unset when batch jobs naturally finish
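// With an infinite (zero) deadline the service allocs are migrated off the
// node while the batch allocs keep running; marking the batch allocs complete
// is what lets the drain finish.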
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the service allocs to be stopped on the draining node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mark the batch allocations as finished
	allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	require.Nil(err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		newAlloc := alloc.Copy()
		newAlloc.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, newAlloc)
	}
	require.Nil(state.UpdateAllocsFromClient(1000, updates))

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the service allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

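// TestDrainer_AllTypes_Deadline_GarbageCollectedNode is the same as the
// deadline test above, but it also creates old terminal allocs that point at
// a non-existent (garbage collected) node to make sure they do not interfere
// with the drain.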
func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create some old terminal allocs for each job that point at a non-existent
	// node to simulate it being on a GC'd node.
	var badAllocs []*structs.Allocation
	for _, job := range []*structs.Job{job, sysjob, bjob} {
		alloc := mock.Alloc()
		alloc.Namespace = job.Namespace
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DesiredStatus = structs.AllocDesiredStatusStop
		alloc.ClientStatus = structs.AllocClientStatusComplete
		badAllocs = append(badAllocs, alloc)
	}
	require.NoError(state.UpsertAllocs(1, badAllocs))

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that transitions to force drain work.
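// A batch job is left running under a normal drain (infinite or long
// deadline); once the drain is updated to a negative deadline, the allocs
// should be forcibly stopped.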
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	t.Parallel()

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			require := require.New(t)
			s1 := TestServer(t, nil)
			defer s1.Shutdown()
			codec := rpcClient(t, s1)
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			require.NotZero(resp.Index)

			// Wait for the allocations to be placed
			state := s1.State()
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Pick the deadline
			deadline := 0 * time.Second
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Wait for the allocs to be replaced
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

			// Make sure the batch job isn't affected
			testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
				if err := checkAllocPromoter(errCh); err != nil {
					return false, fmt.Errorf("check alloc promoter error: %v", err)
				}

				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, fmt.Errorf("AllocsByNode error: %v", err)
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Make sure the batch allocs are forced to stop
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check that the node drain is removed
			testutil.WaitForResult(func() (bool, error) {
				node, err := state.NodeByID(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check we got the right events
			node, err := state.NodeByID(nil, n1.ID)
			require.NoError(err)
			require.Len(node.Events, 4)
			require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
			require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
		})
	}
}