github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/drainer_int_test.go

package nomad

import (
	"context"
	"fmt"
	"net/rpc"
	"testing"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

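// allocPromoter watches the allocations placed on the given node and, for
// every service alloc that does not yet have a deployment health status,
// marks it healthy via Node.UpdateAlloc, mimicking a client's health
// reporting. It exits when ctx is canceled and reports failures on errCh.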
func allocPromoter(errCh chan<- error, ctx context.Context,
	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
	logger log.Logger) {

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			if alloc.DeploymentStatus.HasHealth() {
				continue
			}
			newAlloc := alloc.Copy()
			newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy:   helper.BoolToPtr(true),
				Timestamp: now,
			}
			updates = append(updates, newAlloc)
			logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID)
		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			}
			errCh <- err
		}
	}
}

// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}

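// getNodeAllocs runs a blocking query for the allocations on the given node,
// starting at the given index, and returns the allocs together with the index
// at which they were observed.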
func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}

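// getNodeAllocsImpl returns the query function that getNodeAllocs passes to
// the state store's blocking query.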
func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := state.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}

func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

func TestDrainer_DrainEmptyNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Check that the node drain is removed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that the service allocs were stopped before the batch allocs
	var serviceMax, batchMax uint64
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	require.True(serviceMax < batchMax)

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the service allocs to be stopped on the draining node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mark the batch allocations as finished
	allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	require.Nil(err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		newAlloc := alloc.Copy()
		newAlloc.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, newAlloc)
	}
	require.Nil(state.UpdateAllocsFromClient(1000, updates))

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the service allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create some old terminal allocs for each job that point at a non-existent
	// node to simulate it being on a GC'd node.
	var badAllocs []*structs.Allocation
	for _, job := range []*structs.Job{job, sysjob, bjob} {
		alloc := mock.Alloc()
		alloc.Namespace = job.Namespace
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DesiredStatus = structs.AllocDesiredStatusStop
		alloc.ClientStatus = structs.AllocClientStatusComplete
		badAllocs = append(badAllocs, alloc)
	}
	require.NoError(state.UpsertAllocs(1, badAllocs))

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that transitions to force drain work.
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	t.Parallel()

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			require := require.New(t)
			s1, cleanupS1 := TestServer(t, nil)
			defer cleanupS1()
			codec := rpcClient(t, s1)
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			require.NotZero(resp.Index)

			// Wait for the allocations to be placed
			state := s1.State()
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Pick the deadline
			deadline := 0 * time.Second
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Wait for the allocs to be replaced
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

			// Make sure the batch job isn't affected
			testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
				if err := checkAllocPromoter(errCh); err != nil {
					return false, fmt.Errorf("check alloc promoter error: %v", err)
				}

				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, fmt.Errorf("AllocsByNode error: %v", err)
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Make sure the batch job is migrated
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check that the node drain is removed
			testutil.WaitForResult(func() (bool, error) {
				node, err := state.NodeByID(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check we got the right events
			node, err := state.NodeByID(nil, n1.ID)
			require.NoError(err)
			require.Len(node.Events, 4)
			require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
			require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
		})
	}
}