github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/drainer_int_test.go

package nomad

import (
	"context"
	"fmt"
	"log"
	"net/rpc"
	"testing"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)
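
// allocPromoter watches the given node's allocations and marks the deployment
// health of any service alloc that does not yet have one, standing in for the
// client-side health reporting a real Nomad client would perform. It runs
// until ctx is canceled and reports failures on errCh.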
func allocPromoter(errCh chan<- error, ctx context.Context,
	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
	logger *log.Logger) {

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			if alloc.DeploymentStatus.HasHealth() {
				continue
			}
			newAlloc := alloc.Copy()
			newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy:   helper.BoolToPtr(true),
				Timestamp: now,
			}
			updates = append(updates, newAlloc)
			logger.Printf("Marked deployment health for alloc %q", alloc.ID)
		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			}
			errCh <- err
		}
	}
}
// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}
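
// getNodeAllocs runs a blocking query against the state store and returns the
// given node's allocations once the allocs table passes the given index.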
func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}
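
// getNodeAllocsImpl returns the query function used by getNodeAllocs: it
// captures all allocations on the given node along with the current index of
// the allocs table.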
func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := state.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}
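
// TestDrainer_Simple_ServiceOnly drains a node running two service allocs and
// verifies they are replaced on a second node and that the drain strategy is
// cleared once the migration completes.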
func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
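
// TestDrainer_Simple_ServiceOnly_Deadline verifies that a drain deadline stops
// service allocs that have not migrated in time, and that the node's final
// drain event records that the deadline was hit.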
func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job with an update stagger longer than the drain deadline
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}
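
// TestDrainer_DrainEmptyNode verifies that draining a node with no
// allocations completes immediately.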
func TestDrainer_DrainEmptyNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Check that the node drain is removed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
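
// TestDrainer_AllTypes_Deadline drains a node running service, system, and
// batch allocs with a short deadline and verifies that everything is stopped,
// that replacements land on a second node, and that service allocs are
// stopped before batch allocs.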
func TestDrainer_AllTypes_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that the service allocs were stopped before the batch allocs
	var serviceMax, batchMax uint64
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	require.True(serviceMax < batchMax)

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the service allocs to be stopped on the draining node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mark the batch allocations as finished
	allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	require.Nil(err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		newAlloc := alloc.Copy()
		newAlloc.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, newAlloc)
	}
	require.Nil(state.UpdateAllocsFromClient(1000, updates))

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the service and system allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
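
// TestDrainer_AllTypes_Deadline_GarbageCollectedNode is like the deadline test
// above, but seeds old terminal allocs pointing at a non-existent (GC'd) node
// to ensure they do not interfere with the drain.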
func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create some old terminal allocs for each job that point at a non-existent
	// node to simulate them being on a GC'd node.
	var badAllocs []*structs.Allocation
	for _, job := range []*structs.Job{job, sysjob, bjob} {
		alloc := mock.Alloc()
		alloc.Namespace = job.Namespace
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DesiredStatus = structs.AllocDesiredStatusStop
		alloc.ClientStatus = structs.AllocClientStatusComplete
		badAllocs = append(badAllocs, alloc)
	}
	require.NoError(state.UpsertAllocs(1, badAllocs))

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that transitions to force drain work.
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			s1 := TestServer(t, nil)
			defer s1.Shutdown()
			codec := rpcClient(t, s1)
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			require.NotZero(resp.Index)

			// Wait for the allocations to be placed
			state := s1.State()
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Pick the deadline
			deadline := 0 * time.Second
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Wait for the allocs to be replaced
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

			// Make sure the batch job isn't affected
			testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
				if err := checkAllocPromoter(errCh); err != nil {
					return false, err
				}

				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force drain
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Make sure the batch job is migrated
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check that the node drain is removed
			testutil.WaitForResult(func() (bool, error) {
				node, err := state.NodeByID(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check we got the right events
			node, err := state.NodeByID(nil, n1.ID)
			require.NoError(err)
			require.Len(node.Events, 4)
			require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
			require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
		})
	}
}