github.com/hernad/nomad@v1.6.112/nomad/drainer/watch_jobs_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package drainer
     5  
     6  import (
     7  	"context"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/shoenig/test"
    12  	"github.com/shoenig/test/must"
    13  	"github.com/stretchr/testify/require"
    14  	"golang.org/x/time/rate"
    15  
    16  	"github.com/hernad/nomad/ci"
    17  	"github.com/hernad/nomad/helper/pointer"
    18  	"github.com/hernad/nomad/helper/testlog"
    19  	"github.com/hernad/nomad/helper/uuid"
    20  	"github.com/hernad/nomad/nomad/mock"
    21  	"github.com/hernad/nomad/nomad/state"
    22  	"github.com/hernad/nomad/nomad/structs"
    23  )
    24  
    25  func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) {
    26  	n1 := mock.Node()
    27  	n1.Name = "draining"
    28  	n1.DrainStrategy = &structs.DrainStrategy{
    29  		DrainSpec: structs.DrainSpec{
    30  			Deadline: time.Minute,
    31  		},
    32  		ForceDeadline: time.Now().Add(time.Minute),
    33  	}
    34  	require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 100, n1))
    35  
    36  	// Create a non-draining node
    37  	n2 := mock.Node()
    38  	n2.Name = "running"
    39  	require.Nil(t, state.UpsertNode(structs.MsgTypeTestSetup, 101, n2))
    40  	return n1, n2
    41  }
    42  
    43  func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) {
    44  	t.Helper()
    45  
    46  	limiter := rate.NewLimiter(100.0, 100)
    47  	logger := testlog.HCLogger(t)
    48  	ctx, cancel := context.WithCancel(context.Background())
    49  	w := NewDrainingJobWatcher(ctx, limiter, state, logger)
    50  	return w, cancel
    51  }
    52  
    53  // TestDrainingJobWatcher_Interface is a compile-time assertion that we
    54  // implement the intended interface.
    55  func TestDrainingJobWatcher_Interface(t *testing.T) {
    56  	ci.Parallel(t)
    57  
    58  	w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t))
    59  	cancel()
    60  	var _ DrainingJobWatcher = w
    61  }
    62  
    63  // asertJobWatcherOps asserts a certain number of allocs are drained and/or
    64  // migrated by the job watcher.
    65  func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) (
    66  	*DrainRequest, []*structs.Allocation) {
    67  	t.Helper()
    68  	var (
    69  		drains                           *DrainRequest
    70  		migrations                       []*structs.Allocation
    71  		drainsChecked, migrationsChecked bool
    72  	)
    73  	for {
    74  		select {
    75  		case drains = <-jw.Drain():
    76  			ids := make([]string, len(drains.Allocs))
    77  			for i, a := range drains.Allocs {
    78  				ids[i] = a.JobID[:6] + ":" + a.ID[:6]
    79  			}
    80  			t.Logf("draining %d allocs: %v", len(ids), ids)
    81  			require.False(t, drainsChecked, "drains already received")
    82  			drainsChecked = true
    83  			require.Lenf(t, drains.Allocs, drained,
    84  				"expected %d drains but found %d", drained, len(drains.Allocs))
    85  		case migrations = <-jw.Migrated():
    86  			ids := make([]string, len(migrations))
    87  			for i, a := range migrations {
    88  				ids[i] = a.JobID[:6] + ":" + a.ID[:6]
    89  			}
    90  			t.Logf("migrating %d allocs: %v", len(ids), ids)
    91  			require.False(t, migrationsChecked, "migrations already received")
    92  			migrationsChecked = true
    93  			require.Lenf(t, migrations, migrated,
    94  				"expected %d migrations but found %d", migrated, len(migrations))
    95  		case <-time.After(10 * time.Millisecond):
    96  			if !drainsChecked && drained > 0 {
    97  				t.Fatalf("expected %d drains but none happened", drained)
    98  			}
    99  			if !migrationsChecked && migrated > 0 {
   100  				t.Fatalf("expected %d migrations but none happened", migrated)
   101  			}
   102  			return drains, migrations
   103  		}
   104  	}
   105  }
   106  
// TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches
// allocation changes from multiple jobs.
//
// Two jobs with 8 allocs each are placed on the draining node with
// max_parallel=3. The test then walks through three rounds of fake migrations
// and expects drain batches of 6, 6 and 4 allocs (16 in total), after which no
// draining jobs should remain registered with the watcher.
func TestDrainingJobWatcher_DrainJobs(t *testing.T) {
	ci.Parallel(t)

	store := state.TestStateStore(t)
	jobWatcher, cancelWatcher := testDrainingJobWatcher(t, store)
	defer cancelWatcher()
	drainingNode, runningNode := testNodes(t, store)

	// index is the state store index used for each subsequent write; it is
	// bumped after every upsert below.
	var index uint64 = 101
	count := 8

	// newAlloc returns a mock alloc for the given job's first task group,
	// placed on the given node.
	newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation {
		a := mock.Alloc()
		a.JobID = job.ID
		a.Job = job
		a.TaskGroup = job.TaskGroups[0].Name
		a.NodeID = node.ID
		return a
	}

	// 2 jobs with count 8, max parallel 3
	jnss := make([]structs.NamespacedID, 2)
	jobs := make([]*structs.Job, 2)
	for i := 0; i < 2; i++ {
		job := mock.Job()
		jobs[i] = job
		jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID}
		job.TaskGroups[0].Migrate.MaxParallel = 3
		job.TaskGroups[0].Count = count
		must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, index, nil, job))
		index++

		// All of the job's allocs start out healthy on the draining node.
		var allocs []*structs.Allocation
		for i := 0; i < count; i++ {
			a := newAlloc(drainingNode, job)
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: pointer.Of(true),
			}
			allocs = append(allocs, a)
		}

		must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, allocs))
		index++

	}

	// Only register jobs with watcher after creating all data models as
	// once the watcher starts we need to track the index carefully for
	// updating the batch future
	jobWatcher.RegisterJobs(jnss)

	// Expect a first batch of MaxParallel allocs from each job
	drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0)

	// Fake migrating the drained allocs by starting new ones and stopping
	// the old ones
	drainedAllocs := make([]*structs.Allocation, len(drains.Allocs))
	for i, a := range drains.Allocs {
		a.DesiredTransition.Migrate = pointer.Of(true)

		// create a copy so we can reuse this slice
		drainedAllocs[i] = a.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
	drains.Resp.Respond(index, nil)
	index++

	// Just setting ShouldMigrate should not cause any further drains
	assertJobWatcherOps(t, jobWatcher, 0, 0)

	// Proceed our fake migration along by creating new allocs and stopping
	// old ones
	replacements := make([]*structs.Allocation, len(drainedAllocs))
	updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2)
	for i, a := range drainedAllocs {
		// Stop drained allocs
		a.DesiredTransition.Migrate = nil
		a.DesiredStatus = structs.AllocDesiredStatusStop

		// Create a replacement
		replacement := mock.Alloc()
		replacement.JobID = a.Job.ID
		replacement.Job = a.Job
		replacement.TaskGroup = a.TaskGroup
		replacement.NodeID = runningNode.ID
		// start in pending state with no health status

		updates = append(updates, a, replacement)
		replacements[i] = replacement.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
	index++

	// The drained allocs are only desired-stop at this point (their client
	// status has not been updated yet), so neither migrations nor new drains
	// are expected
	assertJobWatcherOps(t, jobWatcher, 0, 0)

	// Client sends stop on these allocs
	completeAllocs := make([]*structs.Allocation, len(drainedAllocs))
	for i, a := range drainedAllocs {
		a = a.Copy()
		a.ClientStatus = structs.AllocClientStatusComplete
		completeAllocs[i] = a
	}
	must.NoError(t, store.UpdateAllocsFromClient(structs.MsgTypeTestSetup, index, completeAllocs))
	index++

	// The drained allocs stopping cause migrations but no new drains
	// because the replacements have not started
	assertJobWatcherOps(t, jobWatcher, 0, 6)

	// Finally kickoff further drain activity by "starting" replacements
	for _, a := range replacements {
		a.ClientStatus = structs.AllocClientStatusRunning
		a.DeploymentStatus = &structs.AllocDeploymentStatus{
			Healthy: pointer.Of(true),
		}
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
	index++

	must.MapNotEmpty(t, jobWatcher.drainingJobs())

	// 6 new drains
	drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0)

	// Fake migrations once more to finish the drain
	drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
	for i, a := range drains.Allocs {
		a.DesiredTransition.Migrate = pointer.Of(true)

		// create a copy so we can reuse this slice
		drainedAllocs[i] = a.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
	drains.Resp.Respond(index, nil)
	index++

	// Marking the allocs for migration alone triggers nothing new
	assertJobWatcherOps(t, jobWatcher, 0, 0)

	// This round stops the drained allocs and marks them client-complete in
	// a single write, alongside their pending replacements
	replacements = make([]*structs.Allocation, len(drainedAllocs))
	updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
	for i, a := range drainedAllocs {
		a.DesiredTransition.Migrate = nil
		a.DesiredStatus = structs.AllocDesiredStatusStop
		a.ClientStatus = structs.AllocClientStatusComplete

		replacement := newAlloc(runningNode, a.Job)
		updates = append(updates, a, replacement)
		replacements[i] = replacement.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
	index++

	// 6 migrations, but no new drains until the replacements are healthy
	assertJobWatcherOps(t, jobWatcher, 0, 6)

	// "Start" the replacements to unblock the next drain batch
	for _, a := range replacements {
		a.ClientStatus = structs.AllocClientStatusRunning
		a.DeploymentStatus = &structs.AllocDeploymentStatus{
			Healthy: pointer.Of(true),
		}
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))
	index++

	must.MapNotEmpty(t, jobWatcher.drainingJobs())

	// Final 4 new drains
	drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0)

	// Fake migrations once more to finish the drain
	drainedAllocs = make([]*structs.Allocation, len(drains.Allocs))
	for i, a := range drains.Allocs {
		a.DesiredTransition.Migrate = pointer.Of(true)

		// create a copy so we can reuse this slice
		drainedAllocs[i] = a.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, drainedAllocs))
	drains.Resp.Respond(index, nil)
	index++

	// Marking the allocs for migration alone triggers nothing new
	assertJobWatcherOps(t, jobWatcher, 0, 0)

	// Stop and complete the last drained allocs and create their replacements
	replacements = make([]*structs.Allocation, len(drainedAllocs))
	updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2)
	for i, a := range drainedAllocs {
		a.DesiredTransition.Migrate = nil
		a.DesiredStatus = structs.AllocDesiredStatusStop
		a.ClientStatus = structs.AllocClientStatusComplete

		replacement := newAlloc(runningNode, a.Job)
		updates = append(updates, a, replacement)
		replacements[i] = replacement.Copy()
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, updates))
	index++

	// The final 4 migrations, with nothing left to drain
	assertJobWatcherOps(t, jobWatcher, 0, 4)

	// "Start" the last replacements
	for _, a := range replacements {
		a.ClientStatus = structs.AllocClientStatusRunning
		a.DeploymentStatus = &structs.AllocDeploymentStatus{
			Healthy: pointer.Of(true),
		}
	}
	must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, replacements))

	// No jobs should be left!
	must.MapEmpty(t, jobWatcher.drainingJobs())
}
   320  
   321  // TestDrainingJobWatcher_HandleTaskGroup tests that the watcher handles
   322  // allocation updates as expected.
   323  func TestDrainingJobWatcher_HandleTaskGroup(t *testing.T) {
   324  	ci.Parallel(t)
   325  
   326  	testCases := []struct {
   327  		name        string
   328  		batch       bool // use a batch job
   329  		allocCount  int  // number of allocs in test (defaults to 10)
   330  		maxParallel int  // max_parallel (defaults to 1)
   331  
   332  		// addAllocFn will be called allocCount times to create test allocs,
   333  		// and the allocs default to be healthy on the draining node
   334  		addAllocFn func(idx int, a *structs.Allocation, drainingID, runningID string)
   335  
   336  		expectDrained  int
   337  		expectMigrated int
   338  		expectDone     bool
   339  	}{
   340  		{
   341  			// all allocs on draining node, should respect max_parallel=1
   342  			name:           "drain-respects-max-parallel-1",
   343  			expectDrained:  1,
   344  			expectMigrated: 0,
   345  			expectDone:     false,
   346  		},
   347  		{
   348  			// allocs on a non-draining node, should not be drained
   349  			name:           "allocs-on-non-draining-node-should-not-drain",
   350  			expectDrained:  0,
   351  			expectMigrated: 0,
   352  			expectDone:     true,
   353  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   354  				a.NodeID = runningID
   355  			},
   356  		},
   357  		{
   358  			// even unhealthy allocs on a non-draining node should not be drained
   359  			name:           "unhealthy-allocs-on-non-draining-node-should-not-drain",
   360  			expectDrained:  0,
   361  			expectMigrated: 0,
   362  			expectDone:     false,
   363  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   364  				if i%2 == 0 {
   365  					a.NodeID = runningID
   366  					a.DeploymentStatus = nil
   367  				}
   368  			},
   369  		},
   370  		{
   371  			// only the alloc on draining node should be drained
   372  			name:           "healthy-alloc-draining-node-should-drain",
   373  			expectDrained:  1,
   374  			expectMigrated: 0,
   375  			expectDone:     false,
   376  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   377  				if i != 0 {
   378  					a.NodeID = runningID
   379  				}
   380  			},
   381  		},
   382  		{
   383  			// alloc that's still draining doesn't produce more result updates
   384  			name:           "still-draining-alloc-no-new-updates",
   385  			expectDrained:  0,
   386  			expectMigrated: 0,
   387  			expectDone:     false,
   388  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   389  				if i == 0 {
   390  					a.DesiredTransition.Migrate = pointer.Of(true)
   391  					return
   392  				}
   393  				a.NodeID = runningID
   394  			},
   395  		},
   396  		{
   397  			// alloc that's finished draining gets marked as migrated
   398  			name:           "client-terminal-alloc-drain-should-be-finished",
   399  			expectDrained:  0,
   400  			expectMigrated: 1,
   401  			expectDone:     true,
   402  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   403  				if i == 0 {
   404  					a.DesiredStatus = structs.AllocDesiredStatusStop
   405  					a.ClientStatus = structs.AllocClientStatusComplete
   406  					return
   407  				}
   408  				a.NodeID = runningID
   409  			},
   410  		},
   411  		{
   412  			// batch alloc that's finished draining gets marked as migrated
   413  			name:           "client-terminal-batch-alloc-drain-should-be-finished",
   414  			batch:          true,
   415  			expectDrained:  0,
   416  			expectMigrated: 1,
   417  			expectDone:     true,
   418  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   419  				if i == 0 {
   420  					a.DesiredStatus = structs.AllocDesiredStatusStop
   421  					a.ClientStatus = structs.AllocClientStatusComplete
   422  					return
   423  				}
   424  				a.NodeID = runningID
   425  			},
   426  		},
   427  		{
   428  			// all allocs are client-terminal, so nothing left to drain
   429  			name:           "all-client-terminal-drain-should-be-finished",
   430  			expectDrained:  0,
   431  			expectMigrated: 10,
   432  			expectDone:     true,
   433  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   434  				a.DesiredStatus = structs.AllocDesiredStatusStop
   435  				a.ClientStatus = structs.AllocClientStatusComplete
   436  			},
   437  		},
   438  		{
   439  			// all allocs are terminal, but only half are client-terminal
   440  			name:           "half-client-terminal-drain-should-not-be-finished",
   441  			expectDrained:  0,
   442  			expectMigrated: 5,
   443  			expectDone:     false,
   444  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   445  				a.DesiredStatus = structs.AllocDesiredStatusStop
   446  				if i%2 == 0 {
   447  					a.ClientStatus = structs.AllocClientStatusComplete
   448  				}
   449  			},
   450  		},
   451  		{
   452  			// All allocs are terminal, nothing to be drained
   453  			name:           "all-terminal-batch",
   454  			batch:          true,
   455  			expectDrained:  0,
   456  			expectMigrated: 10,
   457  			expectDone:     true,
   458  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   459  				a.DesiredStatus = structs.AllocDesiredStatusStop
   460  				a.ClientStatus = structs.AllocClientStatusComplete
   461  			},
   462  		},
   463  		{
   464  			// with max_parallel=10, all allocs can be drained at once
   465  			name:           "drain-respects-max-parallel-all-at-once",
   466  			expectDrained:  10,
   467  			expectMigrated: 0,
   468  			expectDone:     false,
   469  			maxParallel:    10,
   470  		},
   471  		{
   472  			// with max_parallel=2, up to 2 allocs can be drained at a time
   473  			name:           "drain-respects-max-parallel-2",
   474  			expectDrained:  2,
   475  			expectMigrated: 0,
   476  			expectDone:     false,
   477  			maxParallel:    2,
   478  		},
   479  		{
   480  			// with max_parallel=2, up to 2 allocs can be drained at a time but
   481  			// we haven't yet informed the drainer that 1 has completed
   482  			// migrating
   483  			name:           "notify-migrated-1-on-new-1-drained-1-draining",
   484  			expectDrained:  1,
   485  			expectMigrated: 1,
   486  			maxParallel:    2,
   487  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   488  				switch i {
   489  				case 0:
   490  					// One alloc on running node
   491  					a.NodeID = runningID
   492  				case 1:
   493  					// One alloc already migrated
   494  					a.DesiredStatus = structs.AllocDesiredStatusStop
   495  					a.ClientStatus = structs.AllocClientStatusComplete
   496  				}
   497  			},
   498  		},
   499  		{
   500  			// with max_parallel=2, up to 2 allocs can be drained at a time but
   501  			// we haven't yet informed the drainer that 1 has completed
   502  			// migrating
   503  			name:           "notify-migrated-8-on-new-1-drained-1-draining",
   504  			expectDrained:  1,
   505  			expectMigrated: 1,
   506  			maxParallel:    2,
   507  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   508  				switch i {
   509  				case 0, 1, 2, 3, 4, 5, 6, 7:
   510  					a.NodeID = runningID
   511  				case 8:
   512  					a.DesiredStatus = structs.AllocDesiredStatusStop
   513  					a.ClientStatus = structs.AllocClientStatusComplete
   514  				}
   515  			},
   516  		},
   517  		{
   518  			// 5 on new node, two drained, and three draining
   519  			// with max_parallel=5, up to 5 allocs can be drained at a time but
   520  			// we haven't yet informed the drainer that 2 have completed
   521  			// migrating
   522  			name:           "notify-migrated-5-on-new-2-drained-3-draining",
   523  			expectDrained:  3,
   524  			expectMigrated: 2,
   525  			maxParallel:    5,
   526  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   527  				switch i {
   528  				case 0, 1, 2, 3, 4:
   529  					a.NodeID = runningID
   530  				case 8, 9:
   531  					a.DesiredStatus = structs.AllocDesiredStatusStop
   532  					a.ClientStatus = structs.AllocClientStatusComplete
   533  				}
   534  			},
   535  		},
   536  		{
   537  			// half the allocs have been moved to the new node but 1 doesn't
   538  			// have health set yet, so we should have MaxParallel - 1 in flight
   539  			name:           "pending-health-blocks",
   540  			expectDrained:  1,
   541  			expectMigrated: 1,
   542  			maxParallel:    3,
   543  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   544  				switch i {
   545  				case 0:
   546  					// Deployment status UNset for 1 on new node
   547  					a.NodeID = runningID
   548  					a.DeploymentStatus = nil
   549  				case 1, 2, 3, 4:
   550  					// Deployment status set for 4 on new node
   551  					a.NodeID = runningID
   552  				case 9:
   553  					a.DesiredStatus = structs.AllocDesiredStatusStop
   554  					a.ClientStatus = structs.AllocClientStatusComplete
   555  				}
   556  			},
   557  		},
   558  		{
   559  			// half the allocs have been moved to the new node but 2 don't have
   560  			// health set yet, so we should have MaxParallel - 2 in flight
   561  			name:           "pending-health-blocks-higher-max",
   562  			expectDrained:  2,
   563  			expectMigrated: 1,
   564  			maxParallel:    5,
   565  			addAllocFn: func(i int, a *structs.Allocation, drainingID, runningID string) {
   566  				switch i {
   567  				case 0, 1:
   568  					// Deployment status UNset for 2 on new node
   569  					a.NodeID = runningID
   570  					a.DeploymentStatus = nil
   571  				case 2, 3, 4:
   572  					// Deployment status set for 3 on new node
   573  					a.NodeID = runningID
   574  				case 9:
   575  					a.DesiredStatus = structs.AllocDesiredStatusStop
   576  					a.ClientStatus = structs.AllocClientStatusComplete
   577  				}
   578  			},
   579  		},
   580  	}
   581  
   582  	for _, tc := range testCases {
   583  		tc := tc
   584  		t.Run(tc.name, func(t *testing.T) {
   585  			ci.Parallel(t)
   586  
   587  			// Create nodes
   588  			store := state.TestStateStore(t)
   589  			drainingNode, runningNode := testNodes(t, store)
   590  
   591  			job := mock.Job()
   592  			if tc.batch {
   593  				job = mock.BatchJob()
   594  			}
   595  			job.TaskGroups[0].Count = 10
   596  			if tc.allocCount > 0 {
   597  				job.TaskGroups[0].Count = tc.allocCount
   598  			}
   599  			if tc.maxParallel > 0 {
   600  				job.TaskGroups[0].Migrate.MaxParallel = tc.maxParallel
   601  			}
   602  			must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, 102, nil, job))
   603  
   604  			var allocs []*structs.Allocation
   605  			for i := 0; i < 10; i++ {
   606  				a := mock.Alloc()
   607  				if tc.batch {
   608  					a = mock.BatchAlloc()
   609  				}
   610  				a.JobID = job.ID
   611  				a.Job = job
   612  				a.TaskGroup = job.TaskGroups[0].Name
   613  
   614  				// Default to being healthy on the draining node
   615  				a.NodeID = drainingNode.ID
   616  				a.DeploymentStatus = &structs.AllocDeploymentStatus{
   617  					Healthy: pointer.Of(true),
   618  				}
   619  				if tc.addAllocFn != nil {
   620  					tc.addAllocFn(i, a, drainingNode.ID, runningNode.ID)
   621  				}
   622  				allocs = append(allocs, a)
   623  			}
   624  
   625  			must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, 103, allocs))
   626  			snap, err := store.Snapshot()
   627  			must.NoError(t, err)
   628  
   629  			res := newJobResult()
   630  			must.NoError(t, handleTaskGroup(snap, tc.batch, job.TaskGroups[0], allocs, 102, res))
   631  			test.Len(t, tc.expectDrained, res.drain, test.Sprint("expected drained allocs"))
   632  			test.Len(t, tc.expectMigrated, res.migrated, test.Sprint("expected migrated allocs"))
   633  			test.Eq(t, tc.expectDone, res.done)
   634  		})
   635  	}
   636  }
   637  
   638  func TestHandleTaskGroup_Migrations(t *testing.T) {
   639  	ci.Parallel(t)
   640  	require := require.New(t)
   641  
   642  	// Create a draining node
   643  	state := state.TestStateStore(t)
   644  	n := mock.Node()
   645  	n.DrainStrategy = &structs.DrainStrategy{
   646  		DrainSpec: structs.DrainSpec{
   647  			Deadline: 5 * time.Minute,
   648  		},
   649  		ForceDeadline: time.Now().Add(1 * time.Minute),
   650  	}
   651  	require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))
   652  
   653  	job := mock.Job()
   654  	require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, nil, job))
   655  
   656  	// Create 10 done allocs
   657  	var allocs []*structs.Allocation
   658  	for i := 0; i < 10; i++ {
   659  		a := mock.Alloc()
   660  		a.Job = job
   661  		a.TaskGroup = job.TaskGroups[0].Name
   662  		a.NodeID = n.ID
   663  		a.DeploymentStatus = &structs.AllocDeploymentStatus{
   664  			Healthy: pointer.Of(false),
   665  		}
   666  
   667  		if i%2 == 0 {
   668  			a.DesiredStatus = structs.AllocDesiredStatusStop
   669  			a.ClientStatus = structs.AllocClientStatusComplete
   670  		} else {
   671  			a.ClientStatus = structs.AllocClientStatusFailed
   672  		}
   673  		allocs = append(allocs, a)
   674  	}
   675  	require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))
   676  
   677  	snap, err := state.Snapshot()
   678  	require.Nil(err)
   679  
   680  	// Handle before and after indexes as both service and batch
   681  	res := newJobResult()
   682  	require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
   683  	require.Empty(res.drain)
   684  	require.Len(res.migrated, 10)
   685  	require.True(res.done)
   686  
   687  	res = newJobResult()
   688  	require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
   689  	require.Empty(res.drain)
   690  	require.Len(res.migrated, 10)
   691  	require.True(res.done)
   692  
   693  	res = newJobResult()
   694  	require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
   695  	require.Empty(res.drain)
   696  	require.Empty(res.migrated)
   697  	require.True(res.done)
   698  
   699  	res = newJobResult()
   700  	require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
   701  	require.Empty(res.drain)
   702  	require.Empty(res.migrated)
   703  	require.True(res.done)
   704  }
   705  
   706  // This test asserts that handle task group works when an allocation is on a
   707  // garbage collected node
   708  func TestHandleTaskGroup_GarbageCollectedNode(t *testing.T) {
   709  	ci.Parallel(t)
   710  	require := require.New(t)
   711  
   712  	// Create a draining node
   713  	state := state.TestStateStore(t)
   714  	n := mock.Node()
   715  	n.DrainStrategy = &structs.DrainStrategy{
   716  		DrainSpec: structs.DrainSpec{
   717  			Deadline: 5 * time.Minute,
   718  		},
   719  		ForceDeadline: time.Now().Add(1 * time.Minute),
   720  	}
   721  	require.Nil(state.UpsertNode(structs.MsgTypeTestSetup, 100, n))
   722  
   723  	job := mock.Job()
   724  	require.Nil(state.UpsertJob(structs.MsgTypeTestSetup, 101, nil, job))
   725  
   726  	// Create 10 done allocs
   727  	var allocs []*structs.Allocation
   728  	for i := 0; i < 10; i++ {
   729  		a := mock.Alloc()
   730  		a.Job = job
   731  		a.TaskGroup = job.TaskGroups[0].Name
   732  		a.NodeID = n.ID
   733  		a.DeploymentStatus = &structs.AllocDeploymentStatus{
   734  			Healthy: pointer.Of(false),
   735  		}
   736  
   737  		if i%2 == 0 {
   738  			a.DesiredStatus = structs.AllocDesiredStatusStop
   739  			a.ClientStatus = structs.AllocClientStatusComplete
   740  		} else {
   741  			a.ClientStatus = structs.AllocClientStatusFailed
   742  		}
   743  		allocs = append(allocs, a)
   744  	}
   745  
   746  	// Make the first one be on a GC'd node
   747  	allocs[0].NodeID = uuid.Generate()
   748  	require.Nil(state.UpsertAllocs(structs.MsgTypeTestSetup, 102, allocs))
   749  
   750  	snap, err := state.Snapshot()
   751  	require.Nil(err)
   752  
   753  	// Handle before and after indexes as both service and batch
   754  	res := newJobResult()
   755  	require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 101, res))
   756  	require.Empty(res.drain)
   757  	require.Len(res.migrated, 9)
   758  	require.True(res.done)
   759  
   760  	res = newJobResult()
   761  	require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 101, res))
   762  	require.Empty(res.drain)
   763  	require.Len(res.migrated, 9)
   764  	require.True(res.done)
   765  
   766  	res = newJobResult()
   767  	require.Nil(handleTaskGroup(snap, false, job.TaskGroups[0], allocs, 103, res))
   768  	require.Empty(res.drain)
   769  	require.Empty(res.migrated)
   770  	require.True(res.done)
   771  
   772  	res = newJobResult()
   773  	require.Nil(handleTaskGroup(snap, true, job.TaskGroups[0], allocs, 103, res))
   774  	require.Empty(res.drain)
   775  	require.Empty(res.migrated)
   776  	require.True(res.done)
   777  }