github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/core_sched_test.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"testing"
     6  	"time"
     7  
     8  	memdb "github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/nomad/helper/uuid"
    10  	"github.com/hashicorp/nomad/nomad/mock"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  	"github.com/hashicorp/nomad/testutil"
    13  	"github.com/stretchr/testify/assert"
    14  	"github.com/stretchr/testify/require"
    15  )
    16  
    17  func TestCoreScheduler_EvalGC(t *testing.T) {
    18  	t.Parallel()
    19  	s1 := TestServer(t, nil)
    20  	defer s1.Shutdown()
    21  	testutil.WaitForLeader(t, s1.RPC)
    22  	require := require.New(t)
    23  
    24  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
    25  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
    26  
    27  	// Insert "dead" eval
    28  	state := s1.fsm.State()
    29  	eval := mock.Eval()
    30  	eval.Status = structs.EvalStatusFailed
    31  	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
    32  	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
    33  	require.Nil(err)
    34  
    35  	// Insert mock job with rescheduling disabled
    36  	job := mock.Job()
    37  	job.ID = eval.JobID
    38  	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
    39  		Attempts: 0,
    40  		Interval: 0 * time.Second,
    41  	}
    42  	err = state.UpsertJob(1001, job)
    43  	require.Nil(err)
    44  
    45  	// Insert "dead" alloc
    46  	alloc := mock.Alloc()
    47  	alloc.EvalID = eval.ID
    48  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
    49  	alloc.JobID = eval.JobID
    50  	alloc.TaskGroup = job.TaskGroups[0].Name
    51  
    52  	// Insert "lost" alloc
    53  	alloc2 := mock.Alloc()
    54  	alloc2.EvalID = eval.ID
    55  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
    56  	alloc2.ClientStatus = structs.AllocClientStatusLost
    57  	alloc2.JobID = eval.JobID
    58  	alloc2.TaskGroup = job.TaskGroups[0].Name
    59  	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
    60  	if err != nil {
    61  		t.Fatalf("err: %v", err)
    62  	}
    63  
    64  	// Update the time tables to make this work
    65  	tt := s1.fsm.TimeTable()
    66  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
    67  
    68  	// Create a core scheduler
    69  	snap, err := state.Snapshot()
    70  	if err != nil {
    71  		t.Fatalf("err: %v", err)
    72  	}
    73  	core := NewCoreScheduler(s1, snap)
    74  
    75  	// Attempt the GC
    76  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
    77  	err = core.Process(gc)
    78  	if err != nil {
    79  		t.Fatalf("err: %v", err)
    80  	}
    81  
    82  	// Should be gone
    83  	ws := memdb.NewWatchSet()
    84  	out, err := state.EvalByID(ws, eval.ID)
    85  	if err != nil {
    86  		t.Fatalf("err: %v", err)
    87  	}
    88  	if out != nil {
    89  		t.Fatalf("bad: %v", out)
    90  	}
    91  
    92  	outA, err := state.AllocByID(ws, alloc.ID)
    93  	if err != nil {
    94  		t.Fatalf("err: %v", err)
    95  	}
    96  	if outA != nil {
    97  		t.Fatalf("bad: %v", outA)
    98  	}
    99  
   100  	outA2, err := state.AllocByID(ws, alloc2.ID)
   101  	if err != nil {
   102  		t.Fatalf("err: %v", err)
   103  	}
   104  	if outA2 != nil {
   105  		t.Fatalf("bad: %v", outA2)
   106  	}
   107  }
   108  
   109  // Tests GC behavior on allocations being rescheduled
   110  func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
   111  	t.Parallel()
   112  	s1 := TestServer(t, nil)
   113  	defer s1.Shutdown()
   114  	testutil.WaitForLeader(t, s1.RPC)
   115  	require := require.New(t)
   116  
   117  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   118  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   119  
   120  	// Insert "dead" eval
   121  	state := s1.fsm.State()
   122  	eval := mock.Eval()
   123  	eval.Status = structs.EvalStatusFailed
   124  	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
   125  	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
   126  	require.Nil(err)
   127  
   128  	// Insert "pending" eval for same job
   129  	eval2 := mock.Eval()
   130  	eval2.JobID = eval.JobID
   131  	state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID))
   132  	err = state.UpsertEvals(1003, []*structs.Evaluation{eval2})
   133  	require.Nil(err)
   134  
   135  	// Insert mock job with default reschedule policy of 2 in 10 minutes
   136  	job := mock.Job()
   137  	job.ID = eval.JobID
   138  
   139  	err = state.UpsertJob(1001, job)
   140  	require.Nil(err)
   141  
   142  	// Insert failed alloc with an old reschedule attempt, can be GCed
   143  	alloc := mock.Alloc()
   144  	alloc.EvalID = eval.ID
   145  	alloc.DesiredStatus = structs.AllocDesiredStatusRun
   146  	alloc.ClientStatus = structs.AllocClientStatusFailed
   147  	alloc.JobID = eval.JobID
   148  	alloc.TaskGroup = job.TaskGroups[0].Name
   149  	alloc.NextAllocation = uuid.Generate()
   150  	alloc.RescheduleTracker = &structs.RescheduleTracker{
   151  		Events: []*structs.RescheduleEvent{
   152  			{
   153  				RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
   154  				PrevNodeID:     uuid.Generate(),
   155  				PrevAllocID:    uuid.Generate(),
   156  			},
   157  		},
   158  	}
   159  
   160  	alloc2 := mock.Alloc()
   161  	alloc2.EvalID = eval.ID
   162  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
   163  	alloc2.ClientStatus = structs.AllocClientStatusFailed
   164  	alloc2.JobID = eval.JobID
   165  	alloc2.TaskGroup = job.TaskGroups[0].Name
   166  	alloc2.RescheduleTracker = &structs.RescheduleTracker{
   167  		Events: []*structs.RescheduleEvent{
   168  			{
   169  				RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
   170  				PrevNodeID:     uuid.Generate(),
   171  				PrevAllocID:    uuid.Generate(),
   172  			},
   173  		},
   174  	}
   175  	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
   176  	require.Nil(err)
   177  
   178  	// Update the time tables to make this work
   179  	tt := s1.fsm.TimeTable()
   180  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
   181  
   182  	// Create a core scheduler
   183  	snap, err := state.Snapshot()
   184  	if err != nil {
   185  		t.Fatalf("err: %v", err)
   186  	}
   187  	core := NewCoreScheduler(s1, snap)
   188  
   189  	// Attempt the GC, job has all terminal allocs and one pending eval
   190  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
   191  	err = core.Process(gc)
   192  	require.Nil(err)
   193  
   194  	// Eval should still exist
   195  	ws := memdb.NewWatchSet()
   196  	out, err := state.EvalByID(ws, eval.ID)
   197  	require.Nil(err)
   198  	require.NotNil(out)
   199  	require.Equal(eval.ID, out.ID)
   200  
   201  	outA, err := state.AllocByID(ws, alloc.ID)
   202  	require.Nil(err)
   203  	require.Nil(outA)
   204  
   205  	outA2, err := state.AllocByID(ws, alloc2.ID)
   206  	require.Nil(err)
   207  	require.Equal(alloc2.ID, outA2.ID)
   208  
   209  }
   210  
   211  // Tests GC behavior on stopped job with reschedulable allocs
   212  func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) {
   213  	t.Parallel()
   214  	s1 := TestServer(t, nil)
   215  	defer s1.Shutdown()
   216  	testutil.WaitForLeader(t, s1.RPC)
   217  	require := require.New(t)
   218  
   219  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   220  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   221  
   222  	// Insert "dead" eval
   223  	state := s1.fsm.State()
   224  	eval := mock.Eval()
   225  	eval.Status = structs.EvalStatusFailed
   226  	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
   227  	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
   228  	require.Nil(err)
   229  
   230  	// Insert mock stopped job with default reschedule policy of 2 in 10 minutes
   231  	job := mock.Job()
   232  	job.ID = eval.JobID
   233  	job.Stop = true
   234  
   235  	err = state.UpsertJob(1001, job)
   236  	require.Nil(err)
   237  
   238  	// Insert failed alloc with a recent reschedule attempt
   239  	alloc := mock.Alloc()
   240  	alloc.EvalID = eval.ID
   241  	alloc.DesiredStatus = structs.AllocDesiredStatusRun
   242  	alloc.ClientStatus = structs.AllocClientStatusLost
   243  	alloc.JobID = eval.JobID
   244  	alloc.TaskGroup = job.TaskGroups[0].Name
   245  	alloc.RescheduleTracker = &structs.RescheduleTracker{
   246  		Events: []*structs.RescheduleEvent{
   247  			{
   248  				RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
   249  				PrevNodeID:     uuid.Generate(),
   250  				PrevAllocID:    uuid.Generate(),
   251  			},
   252  		},
   253  	}
   254  	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc})
   255  	require.Nil(err)
   256  
   257  	// Update the time tables to make this work
   258  	tt := s1.fsm.TimeTable()
   259  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
   260  
   261  	// Create a core scheduler
   262  	snap, err := state.Snapshot()
   263  	if err != nil {
   264  		t.Fatalf("err: %v", err)
   265  	}
   266  	core := NewCoreScheduler(s1, snap)
   267  
   268  	// Attempt the GC
   269  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
   270  	err = core.Process(gc)
   271  	require.Nil(err)
   272  
   273  	// Eval should not exist
   274  	ws := memdb.NewWatchSet()
   275  	out, err := state.EvalByID(ws, eval.ID)
   276  	require.Nil(err)
   277  	require.Nil(out)
   278  
   279  	// Alloc should not exist
   280  	outA, err := state.AllocByID(ws, alloc.ID)
   281  	require.Nil(err)
   282  	require.Nil(outA)
   283  
   284  }
   285  
   286  // An EvalGC should never reap a batch job that has not been stopped
   287  func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
   288  	t.Parallel()
   289  	s1 := TestServer(t, nil)
   290  	defer s1.Shutdown()
   291  	testutil.WaitForLeader(t, s1.RPC)
   292  
   293  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   294  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   295  
   296  	// Insert a "dead" job
   297  	state := s1.fsm.State()
   298  	job := mock.Job()
   299  	job.Type = structs.JobTypeBatch
   300  	job.Status = structs.JobStatusDead
   301  	err := state.UpsertJob(1000, job)
   302  	if err != nil {
   303  		t.Fatalf("err: %v", err)
   304  	}
   305  
   306  	// Insert "complete" eval
   307  	eval := mock.Eval()
   308  	eval.Status = structs.EvalStatusComplete
   309  	eval.Type = structs.JobTypeBatch
   310  	eval.JobID = job.ID
   311  	err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
   312  	if err != nil {
   313  		t.Fatalf("err: %v", err)
   314  	}
   315  
   316  	// Insert "failed" alloc
   317  	alloc := mock.Alloc()
   318  	alloc.JobID = job.ID
   319  	alloc.EvalID = eval.ID
   320  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
   321  
   322  	// Insert "lost" alloc
   323  	alloc2 := mock.Alloc()
   324  	alloc2.JobID = job.ID
   325  	alloc2.EvalID = eval.ID
   326  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
   327  	alloc2.ClientStatus = structs.AllocClientStatusLost
   328  
   329  	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
   330  	if err != nil {
   331  		t.Fatalf("err: %v", err)
   332  	}
   333  
   334  	// Update the time tables to make this work
   335  	tt := s1.fsm.TimeTable()
   336  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
   337  
   338  	// Create a core scheduler
   339  	snap, err := state.Snapshot()
   340  	if err != nil {
   341  		t.Fatalf("err: %v", err)
   342  	}
   343  	core := NewCoreScheduler(s1, snap)
   344  
   345  	// Attempt the GC
   346  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
   347  	err = core.Process(gc)
   348  	if err != nil {
   349  		t.Fatalf("err: %v", err)
   350  	}
   351  
   352  	// Nothing should be gone
   353  	ws := memdb.NewWatchSet()
   354  	out, err := state.EvalByID(ws, eval.ID)
   355  	if err != nil {
   356  		t.Fatalf("err: %v", err)
   357  	}
   358  	if out == nil {
   359  		t.Fatalf("bad: %v", out)
   360  	}
   361  
   362  	outA, err := state.AllocByID(ws, alloc.ID)
   363  	if err != nil {
   364  		t.Fatalf("err: %v", err)
   365  	}
   366  	if outA == nil {
   367  		t.Fatalf("bad: %v", outA)
   368  	}
   369  
   370  	outA2, err := state.AllocByID(ws, alloc2.ID)
   371  	if err != nil {
   372  		t.Fatalf("err: %v", err)
   373  	}
   374  	if outA2 == nil {
   375  		t.Fatalf("bad: %v", outA2)
   376  	}
   377  
   378  	outB, err := state.JobByID(ws, job.Namespace, job.ID)
   379  	if err != nil {
   380  		t.Fatalf("err: %v", err)
   381  	}
   382  	if outB == nil {
   383  		t.Fatalf("bad: %v", outB)
   384  	}
   385  }
   386  
   387  // An EvalGC should  reap a batch job that has been stopped
   388  func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
   389  	t.Parallel()
   390  	s1 := TestServer(t, nil)
   391  	defer s1.Shutdown()
   392  	testutil.WaitForLeader(t, s1.RPC)
   393  
   394  	require := require.New(t)
   395  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   396  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   397  
   398  	// Create a "dead" job
   399  	state := s1.fsm.State()
   400  	job := mock.Job()
   401  	job.Type = structs.JobTypeBatch
   402  	job.Status = structs.JobStatusDead
   403  	job.Stop = true
   404  	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
   405  		Attempts: 0,
   406  		Interval: 0 * time.Second,
   407  	}
   408  	err := state.UpsertJob(1001, job)
   409  	require.Nil(err)
   410  
   411  	// Insert "complete" eval
   412  	eval := mock.Eval()
   413  	eval.Status = structs.EvalStatusComplete
   414  	eval.Type = structs.JobTypeBatch
   415  	eval.JobID = job.ID
   416  	err = state.UpsertEvals(1002, []*structs.Evaluation{eval})
   417  	require.Nil(err)
   418  
   419  	// Insert "failed" alloc
   420  	alloc := mock.Alloc()
   421  	alloc.JobID = job.ID
   422  	alloc.EvalID = eval.ID
   423  	alloc.TaskGroup = job.TaskGroups[0].Name
   424  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
   425  
   426  	// Insert "lost" alloc
   427  	alloc2 := mock.Alloc()
   428  	alloc2.JobID = job.ID
   429  	alloc2.EvalID = eval.ID
   430  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
   431  	alloc2.ClientStatus = structs.AllocClientStatusLost
   432  	alloc2.TaskGroup = job.TaskGroups[0].Name
   433  
   434  	err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2})
   435  	if err != nil {
   436  		t.Fatalf("err: %v", err)
   437  	}
   438  
   439  	// Update the time tables to make this work
   440  	tt := s1.fsm.TimeTable()
   441  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
   442  
   443  	// Create a core scheduler
   444  	snap, err := state.Snapshot()
   445  	if err != nil {
   446  		t.Fatalf("err: %v", err)
   447  	}
   448  	core := NewCoreScheduler(s1, snap)
   449  
   450  	// Attempt the GC
   451  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
   452  	err = core.Process(gc)
   453  	if err != nil {
   454  		t.Fatalf("err: %v", err)
   455  	}
   456  
   457  	// Everything should be gone
   458  	ws := memdb.NewWatchSet()
   459  	out, err := state.EvalByID(ws, eval.ID)
   460  	if err != nil {
   461  		t.Fatalf("err: %v", err)
   462  	}
   463  	if out != nil {
   464  		t.Fatalf("bad: %v", out)
   465  	}
   466  
   467  	outA, err := state.AllocByID(ws, alloc.ID)
   468  	if err != nil {
   469  		t.Fatalf("err: %v", err)
   470  	}
   471  	if outA != nil {
   472  		t.Fatalf("bad: %v", outA)
   473  	}
   474  
   475  	outA2, err := state.AllocByID(ws, alloc2.ID)
   476  	if err != nil {
   477  		t.Fatalf("err: %v", err)
   478  	}
   479  	if outA2 != nil {
   480  		t.Fatalf("bad: %v", outA2)
   481  	}
   482  }
   483  
   484  func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
   485  	t.Parallel()
   486  	s1 := TestServer(t, nil)
   487  	defer s1.Shutdown()
   488  	testutil.WaitForLeader(t, s1.RPC)
   489  	require := require.New(t)
   490  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   491  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   492  
   493  	// Insert "dead" eval
   494  	state := s1.fsm.State()
   495  	eval := mock.Eval()
   496  	eval.Status = structs.EvalStatusComplete
   497  	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
   498  	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
   499  	if err != nil {
   500  		t.Fatalf("err: %v", err)
   501  	}
   502  
   503  	// Create mock job with id same as eval
   504  	job := mock.Job()
   505  	job.ID = eval.JobID
   506  
   507  	// Insert "dead" alloc
   508  	alloc := mock.Alloc()
   509  	alloc.JobID = job.ID
   510  	alloc.EvalID = eval.ID
   511  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
   512  	alloc.TaskGroup = job.TaskGroups[0].Name
   513  	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
   514  
   515  	// Insert "lost" alloc
   516  	alloc2 := mock.Alloc()
   517  	alloc2.JobID = job.ID
   518  	alloc2.EvalID = eval.ID
   519  	alloc2.TaskGroup = job.TaskGroups[0].Name
   520  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
   521  	alloc2.ClientStatus = structs.AllocClientStatusLost
   522  
   523  	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
   524  	if err != nil {
   525  		t.Fatalf("err: %v", err)
   526  	}
   527  
   528  	// Insert "running" alloc
   529  	alloc3 := mock.Alloc()
   530  	alloc3.EvalID = eval.ID
   531  	alloc3.JobID = job.ID
   532  	state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID))
   533  	err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3})
   534  	if err != nil {
   535  		t.Fatalf("err: %v", err)
   536  	}
   537  
   538  	// Insert mock job with rescheduling disabled
   539  	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
   540  		Attempts: 0,
   541  		Interval: 0 * time.Second,
   542  	}
   543  	err = state.UpsertJob(1001, job)
   544  	require.Nil(err)
   545  
   546  	// Update the time tables to make this work
   547  	tt := s1.fsm.TimeTable()
   548  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
   549  
   550  	// Create a core scheduler
   551  	snap, err := state.Snapshot()
   552  	if err != nil {
   553  		t.Fatalf("err: %v", err)
   554  	}
   555  	core := NewCoreScheduler(s1, snap)
   556  
   557  	// Attempt the GC
   558  	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
   559  	err = core.Process(gc)
   560  	if err != nil {
   561  		t.Fatalf("err: %v", err)
   562  	}
   563  
   564  	// Should not be gone
   565  	ws := memdb.NewWatchSet()
   566  	out, err := state.EvalByID(ws, eval.ID)
   567  	if err != nil {
   568  		t.Fatalf("err: %v", err)
   569  	}
   570  	if out == nil {
   571  		t.Fatalf("bad: %v", out)
   572  	}
   573  
   574  	outA, err := state.AllocByID(ws, alloc3.ID)
   575  	if err != nil {
   576  		t.Fatalf("err: %v", err)
   577  	}
   578  	if outA == nil {
   579  		t.Fatalf("bad: %v", outA)
   580  	}
   581  
   582  	// Should be gone
   583  	outB, err := state.AllocByID(ws, alloc.ID)
   584  	if err != nil {
   585  		t.Fatalf("err: %v", err)
   586  	}
   587  	if outB != nil {
   588  		t.Fatalf("bad: %v", outB)
   589  	}
   590  
   591  	outC, err := state.AllocByID(ws, alloc2.ID)
   592  	if err != nil {
   593  		t.Fatalf("err: %v", err)
   594  	}
   595  	if outC != nil {
   596  		t.Fatalf("bad: %v", outC)
   597  	}
   598  }
   599  
   600  func TestCoreScheduler_EvalGC_Force(t *testing.T) {
   601  	t.Parallel()
   602  	for _, withAcl := range []bool{false, true} {
   603  		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
   604  			require := require.New(t)
   605  			var server *Server
   606  			if withAcl {
   607  				server, _ = TestACLServer(t, nil)
   608  			} else {
   609  				server = TestServer(t, nil)
   610  			}
   611  			defer server.Shutdown()
   612  			testutil.WaitForLeader(t, server.RPC)
   613  
   614  			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   615  			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   616  
   617  			// Insert "dead" eval
   618  			state := server.fsm.State()
   619  			eval := mock.Eval()
   620  			eval.Status = structs.EvalStatusFailed
   621  			state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
   622  			err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
   623  			if err != nil {
   624  				t.Fatalf("err: %v", err)
   625  			}
   626  
   627  			// Insert mock job with rescheduling disabled
   628  			job := mock.Job()
   629  			job.ID = eval.JobID
   630  			job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
   631  				Attempts: 0,
   632  				Interval: 0 * time.Second,
   633  			}
   634  			err = state.UpsertJob(1001, job)
   635  			require.Nil(err)
   636  
   637  			// Insert "dead" alloc
   638  			alloc := mock.Alloc()
   639  			alloc.EvalID = eval.ID
   640  			alloc.DesiredStatus = structs.AllocDesiredStatusStop
   641  			alloc.TaskGroup = job.TaskGroups[0].Name
   642  			state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
   643  			err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
   644  			if err != nil {
   645  				t.Fatalf("err: %v", err)
   646  			}
   647  
   648  			// Create a core scheduler
   649  			snap, err := state.Snapshot()
   650  			if err != nil {
   651  				t.Fatalf("err: %v", err)
   652  			}
   653  			core := NewCoreScheduler(server, snap)
   654  
   655  			// Attempt the GC
   656  			gc := server.coreJobEval(structs.CoreJobForceGC, 1002)
   657  			err = core.Process(gc)
   658  			if err != nil {
   659  				t.Fatalf("err: %v", err)
   660  			}
   661  
   662  			// Should be gone
   663  			ws := memdb.NewWatchSet()
   664  			out, err := state.EvalByID(ws, eval.ID)
   665  			if err != nil {
   666  				t.Fatalf("err: %v", err)
   667  			}
   668  			if out != nil {
   669  				t.Fatalf("bad: %v", out)
   670  			}
   671  
   672  			outA, err := state.AllocByID(ws, alloc.ID)
   673  			if err != nil {
   674  				t.Fatalf("err: %v", err)
   675  			}
   676  			if outA != nil {
   677  				t.Fatalf("bad: %v", outA)
   678  			}
   679  		})
   680  	}
   681  }
   682  
   683  func TestCoreScheduler_NodeGC(t *testing.T) {
   684  	t.Parallel()
   685  	for _, withAcl := range []bool{false, true} {
   686  		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
   687  			var server *Server
   688  			if withAcl {
   689  				server, _ = TestACLServer(t, nil)
   690  			} else {
   691  				server = TestServer(t, nil)
   692  			}
   693  			defer server.Shutdown()
   694  			testutil.WaitForLeader(t, server.RPC)
   695  
   696  			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   697  			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   698  
   699  			// Insert "dead" node
   700  			state := server.fsm.State()
   701  			node := mock.Node()
   702  			node.Status = structs.NodeStatusDown
   703  			err := state.UpsertNode(1000, node)
   704  			if err != nil {
   705  				t.Fatalf("err: %v", err)
   706  			}
   707  
   708  			// Update the time tables to make this work
   709  			tt := server.fsm.TimeTable()
   710  			tt.Witness(2000, time.Now().UTC().Add(-1*server.config.NodeGCThreshold))
   711  
   712  			// Create a core scheduler
   713  			snap, err := state.Snapshot()
   714  			if err != nil {
   715  				t.Fatalf("err: %v", err)
   716  			}
   717  			core := NewCoreScheduler(server, snap)
   718  
   719  			// Attempt the GC
   720  			gc := server.coreJobEval(structs.CoreJobNodeGC, 2000)
   721  			err = core.Process(gc)
   722  			if err != nil {
   723  				t.Fatalf("err: %v", err)
   724  			}
   725  
   726  			// Should be gone
   727  			ws := memdb.NewWatchSet()
   728  			out, err := state.NodeByID(ws, node.ID)
   729  			if err != nil {
   730  				t.Fatalf("err: %v", err)
   731  			}
   732  			if out != nil {
   733  				t.Fatalf("bad: %v", out)
   734  			}
   735  		})
   736  	}
   737  }
   738  
   739  func TestCoreScheduler_NodeGC_TerminalAllocs(t *testing.T) {
   740  	t.Parallel()
   741  	s1 := TestServer(t, nil)
   742  	defer s1.Shutdown()
   743  	testutil.WaitForLeader(t, s1.RPC)
   744  
   745  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   746  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   747  
   748  	// Insert "dead" node
   749  	state := s1.fsm.State()
   750  	node := mock.Node()
   751  	node.Status = structs.NodeStatusDown
   752  	err := state.UpsertNode(1000, node)
   753  	if err != nil {
   754  		t.Fatalf("err: %v", err)
   755  	}
   756  
   757  	// Insert a terminal alloc on that node
   758  	alloc := mock.Alloc()
   759  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
   760  	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
   761  	if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil {
   762  		t.Fatalf("err: %v", err)
   763  	}
   764  
   765  	// Update the time tables to make this work
   766  	tt := s1.fsm.TimeTable()
   767  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold))
   768  
   769  	// Create a core scheduler
   770  	snap, err := state.Snapshot()
   771  	if err != nil {
   772  		t.Fatalf("err: %v", err)
   773  	}
   774  	core := NewCoreScheduler(s1, snap)
   775  
   776  	// Attempt the GC
   777  	gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000)
   778  	err = core.Process(gc)
   779  	if err != nil {
   780  		t.Fatalf("err: %v", err)
   781  	}
   782  
   783  	// Should be gone
   784  	ws := memdb.NewWatchSet()
   785  	out, err := state.NodeByID(ws, node.ID)
   786  	if err != nil {
   787  		t.Fatalf("err: %v", err)
   788  	}
   789  	if out != nil {
   790  		t.Fatalf("bad: %v", out)
   791  	}
   792  }
   793  
   794  func TestCoreScheduler_NodeGC_RunningAllocs(t *testing.T) {
   795  	t.Parallel()
   796  	s1 := TestServer(t, nil)
   797  	defer s1.Shutdown()
   798  	testutil.WaitForLeader(t, s1.RPC)
   799  
   800  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   801  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   802  
   803  	// Insert "dead" node
   804  	state := s1.fsm.State()
   805  	node := mock.Node()
   806  	node.Status = structs.NodeStatusDown
   807  	err := state.UpsertNode(1000, node)
   808  	if err != nil {
   809  		t.Fatalf("err: %v", err)
   810  	}
   811  
   812  	// Insert a running alloc on that node
   813  	alloc := mock.Alloc()
   814  	alloc.NodeID = node.ID
   815  	alloc.DesiredStatus = structs.AllocDesiredStatusRun
   816  	alloc.ClientStatus = structs.AllocClientStatusRunning
   817  	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
   818  	if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil {
   819  		t.Fatalf("err: %v", err)
   820  	}
   821  
   822  	// Update the time tables to make this work
   823  	tt := s1.fsm.TimeTable()
   824  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold))
   825  
   826  	// Create a core scheduler
   827  	snap, err := state.Snapshot()
   828  	if err != nil {
   829  		t.Fatalf("err: %v", err)
   830  	}
   831  	core := NewCoreScheduler(s1, snap)
   832  
   833  	// Attempt the GC
   834  	gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000)
   835  	err = core.Process(gc)
   836  	if err != nil {
   837  		t.Fatalf("err: %v", err)
   838  	}
   839  
   840  	// Should still be here
   841  	ws := memdb.NewWatchSet()
   842  	out, err := state.NodeByID(ws, node.ID)
   843  	if err != nil {
   844  		t.Fatalf("err: %v", err)
   845  	}
   846  	if out == nil {
   847  		t.Fatalf("bad: %v", out)
   848  	}
   849  }
   850  
   851  func TestCoreScheduler_NodeGC_Force(t *testing.T) {
   852  	t.Parallel()
   853  	s1 := TestServer(t, nil)
   854  	defer s1.Shutdown()
   855  	testutil.WaitForLeader(t, s1.RPC)
   856  
   857  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   858  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   859  
   860  	// Insert "dead" node
   861  	state := s1.fsm.State()
   862  	node := mock.Node()
   863  	node.Status = structs.NodeStatusDown
   864  	err := state.UpsertNode(1000, node)
   865  	if err != nil {
   866  		t.Fatalf("err: %v", err)
   867  	}
   868  
   869  	// Create a core scheduler
   870  	snap, err := state.Snapshot()
   871  	if err != nil {
   872  		t.Fatalf("err: %v", err)
   873  	}
   874  	core := NewCoreScheduler(s1, snap)
   875  
   876  	// Attempt the GC
   877  	gc := s1.coreJobEval(structs.CoreJobForceGC, 1000)
   878  	err = core.Process(gc)
   879  	if err != nil {
   880  		t.Fatalf("err: %v", err)
   881  	}
   882  
   883  	// Should be gone
   884  	ws := memdb.NewWatchSet()
   885  	out, err := state.NodeByID(ws, node.ID)
   886  	if err != nil {
   887  		t.Fatalf("err: %v", err)
   888  	}
   889  	if out != nil {
   890  		t.Fatalf("bad: %v", out)
   891  	}
   892  }
   893  
   894  func TestCoreScheduler_JobGC_OutstandingEvals(t *testing.T) {
   895  	t.Parallel()
   896  	s1 := TestServer(t, nil)
   897  	defer s1.Shutdown()
   898  	testutil.WaitForLeader(t, s1.RPC)
   899  
   900  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
   901  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
   902  
   903  	// Insert job.
   904  	state := s1.fsm.State()
   905  	job := mock.Job()
   906  	job.Type = structs.JobTypeBatch
   907  	job.Status = structs.JobStatusDead
   908  	err := state.UpsertJob(1000, job)
   909  	if err != nil {
   910  		t.Fatalf("err: %v", err)
   911  	}
   912  
   913  	// Insert two evals, one terminal and one not
   914  	eval := mock.Eval()
   915  	eval.JobID = job.ID
   916  	eval.Status = structs.EvalStatusComplete
   917  
   918  	eval2 := mock.Eval()
   919  	eval2.JobID = job.ID
   920  	eval2.Status = structs.EvalStatusPending
   921  	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
   922  	if err != nil {
   923  		t.Fatalf("err: %v", err)
   924  	}
   925  
   926  	// Update the time tables to make this work
   927  	tt := s1.fsm.TimeTable()
   928  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))
   929  
   930  	// Create a core scheduler
   931  	snap, err := state.Snapshot()
   932  	if err != nil {
   933  		t.Fatalf("err: %v", err)
   934  	}
   935  	core := NewCoreScheduler(s1, snap)
   936  
   937  	// Attempt the GC
   938  	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
   939  	err = core.Process(gc)
   940  	if err != nil {
   941  		t.Fatalf("err: %v", err)
   942  	}
   943  
   944  	// Should still exist
   945  	ws := memdb.NewWatchSet()
   946  	out, err := state.JobByID(ws, job.Namespace, job.ID)
   947  	if err != nil {
   948  		t.Fatalf("err: %v", err)
   949  	}
   950  	if out == nil {
   951  		t.Fatalf("bad: %v", out)
   952  	}
   953  
   954  	outE, err := state.EvalByID(ws, eval.ID)
   955  	if err != nil {
   956  		t.Fatalf("err: %v", err)
   957  	}
   958  	if outE == nil {
   959  		t.Fatalf("bad: %v", outE)
   960  	}
   961  
   962  	outE2, err := state.EvalByID(ws, eval2.ID)
   963  	if err != nil {
   964  		t.Fatalf("err: %v", err)
   965  	}
   966  	if outE2 == nil {
   967  		t.Fatalf("bad: %v", outE2)
   968  	}
   969  
   970  	// Update the second eval to be terminal
   971  	eval2.Status = structs.EvalStatusComplete
   972  	err = state.UpsertEvals(1003, []*structs.Evaluation{eval2})
   973  	if err != nil {
   974  		t.Fatalf("err: %v", err)
   975  	}
   976  
   977  	// Create a core scheduler
   978  	snap, err = state.Snapshot()
   979  	if err != nil {
   980  		t.Fatalf("err: %v", err)
   981  	}
   982  	core = NewCoreScheduler(s1, snap)
   983  
   984  	// Attempt the GC
   985  	gc = s1.coreJobEval(structs.CoreJobJobGC, 2000)
   986  	err = core.Process(gc)
   987  	if err != nil {
   988  		t.Fatalf("err: %v", err)
   989  	}
   990  
   991  	// Should not still exist
   992  	out, err = state.JobByID(ws, job.Namespace, job.ID)
   993  	if err != nil {
   994  		t.Fatalf("err: %v", err)
   995  	}
   996  	if out != nil {
   997  		t.Fatalf("bad: %v", out)
   998  	}
   999  
  1000  	outE, err = state.EvalByID(ws, eval.ID)
  1001  	if err != nil {
  1002  		t.Fatalf("err: %v", err)
  1003  	}
  1004  	if outE != nil {
  1005  		t.Fatalf("bad: %v", outE)
  1006  	}
  1007  
  1008  	outE2, err = state.EvalByID(ws, eval2.ID)
  1009  	if err != nil {
  1010  		t.Fatalf("err: %v", err)
  1011  	}
  1012  	if outE2 != nil {
  1013  		t.Fatalf("bad: %v", outE2)
  1014  	}
  1015  }
  1016  
  1017  func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) {
  1018  	t.Parallel()
  1019  	s1 := TestServer(t, nil)
  1020  	defer s1.Shutdown()
  1021  	testutil.WaitForLeader(t, s1.RPC)
  1022  
  1023  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1024  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1025  
  1026  	// Insert job.
  1027  	state := s1.fsm.State()
  1028  	job := mock.Job()
  1029  	job.Type = structs.JobTypeBatch
  1030  	job.Status = structs.JobStatusDead
  1031  	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
  1032  		Attempts: 0,
  1033  		Interval: 0 * time.Second,
  1034  	}
  1035  	err := state.UpsertJob(1000, job)
  1036  	if err != nil {
  1037  		t.Fatalf("err: %v", err)
  1038  	}
  1039  
  1040  	// Insert an eval
  1041  	eval := mock.Eval()
  1042  	eval.JobID = job.ID
  1043  	eval.Status = structs.EvalStatusComplete
  1044  	err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
  1045  	if err != nil {
  1046  		t.Fatalf("err: %v", err)
  1047  	}
  1048  
  1049  	// Insert two allocs, one terminal and one not
  1050  	alloc := mock.Alloc()
  1051  	alloc.JobID = job.ID
  1052  	alloc.EvalID = eval.ID
  1053  	alloc.DesiredStatus = structs.AllocDesiredStatusRun
  1054  	alloc.ClientStatus = structs.AllocClientStatusComplete
  1055  	alloc.TaskGroup = job.TaskGroups[0].Name
  1056  
  1057  	alloc2 := mock.Alloc()
  1058  	alloc2.JobID = job.ID
  1059  	alloc2.EvalID = eval.ID
  1060  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
  1061  	alloc2.ClientStatus = structs.AllocClientStatusRunning
  1062  	alloc2.TaskGroup = job.TaskGroups[0].Name
  1063  
  1064  	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
  1065  	if err != nil {
  1066  		t.Fatalf("err: %v", err)
  1067  	}
  1068  
  1069  	// Update the time tables to make this work
  1070  	tt := s1.fsm.TimeTable()
  1071  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))
  1072  
  1073  	// Create a core scheduler
  1074  	snap, err := state.Snapshot()
  1075  	if err != nil {
  1076  		t.Fatalf("err: %v", err)
  1077  	}
  1078  	core := NewCoreScheduler(s1, snap)
  1079  
  1080  	// Attempt the GC
  1081  	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
  1082  	err = core.Process(gc)
  1083  	if err != nil {
  1084  		t.Fatalf("err: %v", err)
  1085  	}
  1086  
  1087  	// Should still exist
  1088  	ws := memdb.NewWatchSet()
  1089  	out, err := state.JobByID(ws, job.Namespace, job.ID)
  1090  	if err != nil {
  1091  		t.Fatalf("err: %v", err)
  1092  	}
  1093  	if out == nil {
  1094  		t.Fatalf("bad: %v", out)
  1095  	}
  1096  
  1097  	outA, err := state.AllocByID(ws, alloc.ID)
  1098  	if err != nil {
  1099  		t.Fatalf("err: %v", err)
  1100  	}
  1101  	if outA == nil {
  1102  		t.Fatalf("bad: %v", outA)
  1103  	}
  1104  
  1105  	outA2, err := state.AllocByID(ws, alloc2.ID)
  1106  	if err != nil {
  1107  		t.Fatalf("err: %v", err)
  1108  	}
  1109  	if outA2 == nil {
  1110  		t.Fatalf("bad: %v", outA2)
  1111  	}
  1112  
  1113  	// Update the second alloc to be terminal
  1114  	alloc2.ClientStatus = structs.AllocClientStatusComplete
  1115  	err = state.UpsertAllocs(1003, []*structs.Allocation{alloc2})
  1116  	if err != nil {
  1117  		t.Fatalf("err: %v", err)
  1118  	}
  1119  
  1120  	// Create a core scheduler
  1121  	snap, err = state.Snapshot()
  1122  	if err != nil {
  1123  		t.Fatalf("err: %v", err)
  1124  	}
  1125  	core = NewCoreScheduler(s1, snap)
  1126  
  1127  	// Attempt the GC
  1128  	gc = s1.coreJobEval(structs.CoreJobJobGC, 2000)
  1129  	err = core.Process(gc)
  1130  	if err != nil {
  1131  		t.Fatalf("err: %v", err)
  1132  	}
  1133  
  1134  	// Should not still exist
  1135  	out, err = state.JobByID(ws, job.Namespace, job.ID)
  1136  	if err != nil {
  1137  		t.Fatalf("err: %v", err)
  1138  	}
  1139  	if out != nil {
  1140  		t.Fatalf("bad: %v", out)
  1141  	}
  1142  
  1143  	outA, err = state.AllocByID(ws, alloc.ID)
  1144  	if err != nil {
  1145  		t.Fatalf("err: %v", err)
  1146  	}
  1147  	if outA != nil {
  1148  		t.Fatalf("bad: %v", outA)
  1149  	}
  1150  
  1151  	outA2, err = state.AllocByID(ws, alloc2.ID)
  1152  	if err != nil {
  1153  		t.Fatalf("err: %v", err)
  1154  	}
  1155  	if outA2 != nil {
  1156  		t.Fatalf("bad: %v", outA2)
  1157  	}
  1158  }
  1159  
  1160  // This test ensures that batch jobs are GC'd in one shot, meaning it all
  1161  // allocs/evals and job or nothing
  1162  func TestCoreScheduler_JobGC_OneShot(t *testing.T) {
  1163  	t.Parallel()
  1164  	s1 := TestServer(t, nil)
  1165  	defer s1.Shutdown()
  1166  	testutil.WaitForLeader(t, s1.RPC)
  1167  
  1168  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1169  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1170  
  1171  	// Insert job.
  1172  	state := s1.fsm.State()
  1173  	job := mock.Job()
  1174  	job.Type = structs.JobTypeBatch
  1175  	err := state.UpsertJob(1000, job)
  1176  	if err != nil {
  1177  		t.Fatalf("err: %v", err)
  1178  	}
  1179  
  1180  	// Insert two complete evals
  1181  	eval := mock.Eval()
  1182  	eval.JobID = job.ID
  1183  	eval.Status = structs.EvalStatusComplete
  1184  
  1185  	eval2 := mock.Eval()
  1186  	eval2.JobID = job.ID
  1187  	eval2.Status = structs.EvalStatusComplete
  1188  
  1189  	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
  1190  	if err != nil {
  1191  		t.Fatalf("err: %v", err)
  1192  	}
  1193  
  1194  	// Insert one complete alloc and one running on distinct evals
  1195  	alloc := mock.Alloc()
  1196  	alloc.JobID = job.ID
  1197  	alloc.EvalID = eval.ID
  1198  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
  1199  
  1200  	alloc2 := mock.Alloc()
  1201  	alloc2.JobID = job.ID
  1202  	alloc2.EvalID = eval2.ID
  1203  	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
  1204  
  1205  	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
  1206  	if err != nil {
  1207  		t.Fatalf("err: %v", err)
  1208  	}
  1209  
  1210  	// Force the jobs state to dead
  1211  	job.Status = structs.JobStatusDead
  1212  
  1213  	// Update the time tables to make this work
  1214  	tt := s1.fsm.TimeTable()
  1215  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))
  1216  
  1217  	// Create a core scheduler
  1218  	snap, err := state.Snapshot()
  1219  	if err != nil {
  1220  		t.Fatalf("err: %v", err)
  1221  	}
  1222  	core := NewCoreScheduler(s1, snap)
  1223  
  1224  	// Attempt the GC
  1225  	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
  1226  	err = core.Process(gc)
  1227  	if err != nil {
  1228  		t.Fatalf("err: %v", err)
  1229  	}
  1230  
  1231  	// Should still exist
  1232  	ws := memdb.NewWatchSet()
  1233  	out, err := state.JobByID(ws, job.Namespace, job.ID)
  1234  	if err != nil {
  1235  		t.Fatalf("err: %v", err)
  1236  	}
  1237  	if out == nil {
  1238  		t.Fatalf("bad: %v", out)
  1239  	}
  1240  
  1241  	outE, err := state.EvalByID(ws, eval.ID)
  1242  	if err != nil {
  1243  		t.Fatalf("err: %v", err)
  1244  	}
  1245  	if outE == nil {
  1246  		t.Fatalf("bad: %v", outE)
  1247  	}
  1248  
  1249  	outE2, err := state.EvalByID(ws, eval2.ID)
  1250  	if err != nil {
  1251  		t.Fatalf("err: %v", err)
  1252  	}
  1253  	if outE2 == nil {
  1254  		t.Fatalf("bad: %v", outE2)
  1255  	}
  1256  
  1257  	outA, err := state.AllocByID(ws, alloc.ID)
  1258  	if err != nil {
  1259  		t.Fatalf("err: %v", err)
  1260  	}
  1261  	if outA == nil {
  1262  		t.Fatalf("bad: %v", outA)
  1263  	}
  1264  	outA2, err := state.AllocByID(ws, alloc2.ID)
  1265  	if err != nil {
  1266  		t.Fatalf("err: %v", err)
  1267  	}
  1268  	if outA2 == nil {
  1269  		t.Fatalf("bad: %v", outA2)
  1270  	}
  1271  }
  1272  
  1273  // This test ensures that stopped jobs are GCd
  1274  func TestCoreScheduler_JobGC_Stopped(t *testing.T) {
  1275  	t.Parallel()
  1276  	s1 := TestServer(t, nil)
  1277  	defer s1.Shutdown()
  1278  	testutil.WaitForLeader(t, s1.RPC)
  1279  
  1280  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1281  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1282  
  1283  	// Insert job.
  1284  	state := s1.fsm.State()
  1285  	job := mock.Job()
  1286  	job.Stop = true
  1287  	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
  1288  		Attempts: 0,
  1289  		Interval: 0 * time.Second,
  1290  	}
  1291  	err := state.UpsertJob(1000, job)
  1292  	if err != nil {
  1293  		t.Fatalf("err: %v", err)
  1294  	}
  1295  
  1296  	// Insert two complete evals
  1297  	eval := mock.Eval()
  1298  	eval.JobID = job.ID
  1299  	eval.Status = structs.EvalStatusComplete
  1300  
  1301  	eval2 := mock.Eval()
  1302  	eval2.JobID = job.ID
  1303  	eval2.Status = structs.EvalStatusComplete
  1304  
  1305  	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
  1306  	if err != nil {
  1307  		t.Fatalf("err: %v", err)
  1308  	}
  1309  
  1310  	// Insert one complete alloc
  1311  	alloc := mock.Alloc()
  1312  	alloc.JobID = job.ID
  1313  	alloc.EvalID = eval.ID
  1314  	alloc.DesiredStatus = structs.AllocDesiredStatusStop
  1315  	alloc.TaskGroup = job.TaskGroups[0].Name
  1316  	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
  1317  	if err != nil {
  1318  		t.Fatalf("err: %v", err)
  1319  	}
  1320  
  1321  	// Update the time tables to make this work
  1322  	tt := s1.fsm.TimeTable()
  1323  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))
  1324  
  1325  	// Create a core scheduler
  1326  	snap, err := state.Snapshot()
  1327  	if err != nil {
  1328  		t.Fatalf("err: %v", err)
  1329  	}
  1330  	core := NewCoreScheduler(s1, snap)
  1331  
  1332  	// Attempt the GC
  1333  	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
  1334  	err = core.Process(gc)
  1335  	if err != nil {
  1336  		t.Fatalf("err: %v", err)
  1337  	}
  1338  
  1339  	// Shouldn't still exist
  1340  	ws := memdb.NewWatchSet()
  1341  	out, err := state.JobByID(ws, job.Namespace, job.ID)
  1342  	if err != nil {
  1343  		t.Fatalf("err: %v", err)
  1344  	}
  1345  	if out != nil {
  1346  		t.Fatalf("bad: %v", out)
  1347  	}
  1348  
  1349  	outE, err := state.EvalByID(ws, eval.ID)
  1350  	if err != nil {
  1351  		t.Fatalf("err: %v", err)
  1352  	}
  1353  	if outE != nil {
  1354  		t.Fatalf("bad: %v", outE)
  1355  	}
  1356  
  1357  	outE2, err := state.EvalByID(ws, eval2.ID)
  1358  	if err != nil {
  1359  		t.Fatalf("err: %v", err)
  1360  	}
  1361  	if outE2 != nil {
  1362  		t.Fatalf("bad: %v", outE2)
  1363  	}
  1364  
  1365  	outA, err := state.AllocByID(ws, alloc.ID)
  1366  	if err != nil {
  1367  		t.Fatalf("err: %v", err)
  1368  	}
  1369  	if outA != nil {
  1370  		t.Fatalf("bad: %v", outA)
  1371  	}
  1372  }
  1373  
  1374  func TestCoreScheduler_JobGC_Force(t *testing.T) {
  1375  	t.Parallel()
  1376  	for _, withAcl := range []bool{false, true} {
  1377  		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
  1378  			var server *Server
  1379  			if withAcl {
  1380  				server, _ = TestACLServer(t, nil)
  1381  			} else {
  1382  				server = TestServer(t, nil)
  1383  			}
  1384  			defer server.Shutdown()
  1385  			testutil.WaitForLeader(t, server.RPC)
  1386  
  1387  			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1388  			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1389  
  1390  			// Insert job.
  1391  			state := server.fsm.State()
  1392  			job := mock.Job()
  1393  			job.Type = structs.JobTypeBatch
  1394  			job.Status = structs.JobStatusDead
  1395  			err := state.UpsertJob(1000, job)
  1396  			if err != nil {
  1397  				t.Fatalf("err: %v", err)
  1398  			}
  1399  
  1400  			// Insert a terminal eval
  1401  			eval := mock.Eval()
  1402  			eval.JobID = job.ID
  1403  			eval.Status = structs.EvalStatusComplete
  1404  			err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
  1405  			if err != nil {
  1406  				t.Fatalf("err: %v", err)
  1407  			}
  1408  
  1409  			// Create a core scheduler
  1410  			snap, err := state.Snapshot()
  1411  			if err != nil {
  1412  				t.Fatalf("err: %v", err)
  1413  			}
  1414  			core := NewCoreScheduler(server, snap)
  1415  
  1416  			// Attempt the GC
  1417  			gc := server.coreJobEval(structs.CoreJobForceGC, 1002)
  1418  			err = core.Process(gc)
  1419  			if err != nil {
  1420  				t.Fatalf("err: %v", err)
  1421  			}
  1422  
  1423  			// Shouldn't still exist
  1424  			ws := memdb.NewWatchSet()
  1425  			out, err := state.JobByID(ws, job.Namespace, job.ID)
  1426  			if err != nil {
  1427  				t.Fatalf("err: %v", err)
  1428  			}
  1429  			if out != nil {
  1430  				t.Fatalf("bad: %v", out)
  1431  			}
  1432  
  1433  			outE, err := state.EvalByID(ws, eval.ID)
  1434  			if err != nil {
  1435  				t.Fatalf("err: %v", err)
  1436  			}
  1437  			if outE != nil {
  1438  				t.Fatalf("bad: %v", outE)
  1439  			}
  1440  		})
  1441  	}
  1442  }
  1443  
  1444  // This test ensures parameterized jobs only get gc'd when stopped
  1445  func TestCoreScheduler_JobGC_Parameterized(t *testing.T) {
  1446  	t.Parallel()
  1447  	s1 := TestServer(t, nil)
  1448  	defer s1.Shutdown()
  1449  	testutil.WaitForLeader(t, s1.RPC)
  1450  
  1451  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1452  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1453  
  1454  	// Insert a parameterized job.
  1455  	state := s1.fsm.State()
  1456  	job := mock.Job()
  1457  	job.Type = structs.JobTypeBatch
  1458  	job.Status = structs.JobStatusRunning
  1459  	job.ParameterizedJob = &structs.ParameterizedJobConfig{
  1460  		Payload: structs.DispatchPayloadRequired,
  1461  	}
  1462  	err := state.UpsertJob(1000, job)
  1463  	if err != nil {
  1464  		t.Fatalf("err: %v", err)
  1465  	}
  1466  
  1467  	// Create a core scheduler
  1468  	snap, err := state.Snapshot()
  1469  	if err != nil {
  1470  		t.Fatalf("err: %v", err)
  1471  	}
  1472  	core := NewCoreScheduler(s1, snap)
  1473  
  1474  	// Attempt the GC
  1475  	gc := s1.coreJobEval(structs.CoreJobForceGC, 1002)
  1476  	err = core.Process(gc)
  1477  	if err != nil {
  1478  		t.Fatalf("err: %v", err)
  1479  	}
  1480  
  1481  	// Should still exist
  1482  	ws := memdb.NewWatchSet()
  1483  	out, err := state.JobByID(ws, job.Namespace, job.ID)
  1484  	if err != nil {
  1485  		t.Fatalf("err: %v", err)
  1486  	}
  1487  	if out == nil {
  1488  		t.Fatalf("bad: %v", out)
  1489  	}
  1490  
  1491  	// Mark the job as stopped and try again
  1492  	job2 := job.Copy()
  1493  	job2.Stop = true
  1494  	err = state.UpsertJob(2000, job2)
  1495  	if err != nil {
  1496  		t.Fatalf("err: %v", err)
  1497  	}
  1498  
  1499  	// Create a core scheduler
  1500  	snap, err = state.Snapshot()
  1501  	if err != nil {
  1502  		t.Fatalf("err: %v", err)
  1503  	}
  1504  	core = NewCoreScheduler(s1, snap)
  1505  
  1506  	// Attempt the GC
  1507  	gc = s1.coreJobEval(structs.CoreJobForceGC, 2002)
  1508  	err = core.Process(gc)
  1509  	if err != nil {
  1510  		t.Fatalf("err: %v", err)
  1511  	}
  1512  
  1513  	// Should not exist
  1514  	out, err = state.JobByID(ws, job.Namespace, job.ID)
  1515  	if err != nil {
  1516  		t.Fatalf("err: %v", err)
  1517  	}
  1518  	if out != nil {
  1519  		t.Fatalf("bad: %+v", out)
  1520  	}
  1521  }
  1522  
  1523  // This test ensures periodic jobs don't get GCd until they are stopped
  1524  func TestCoreScheduler_JobGC_Periodic(t *testing.T) {
  1525  	t.Parallel()
  1526  
  1527  	s1 := TestServer(t, nil)
  1528  	defer s1.Shutdown()
  1529  	testutil.WaitForLeader(t, s1.RPC)
  1530  
  1531  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1532  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1533  
  1534  	// Insert a parameterized job.
  1535  	state := s1.fsm.State()
  1536  	job := mock.PeriodicJob()
  1537  	err := state.UpsertJob(1000, job)
  1538  	if err != nil {
  1539  		t.Fatalf("err: %v", err)
  1540  	}
  1541  
  1542  	// Create a core scheduler
  1543  	snap, err := state.Snapshot()
  1544  	if err != nil {
  1545  		t.Fatalf("err: %v", err)
  1546  	}
  1547  	core := NewCoreScheduler(s1, snap)
  1548  
  1549  	// Attempt the GC
  1550  	gc := s1.coreJobEval(structs.CoreJobForceGC, 1002)
  1551  	err = core.Process(gc)
  1552  	if err != nil {
  1553  		t.Fatalf("err: %v", err)
  1554  	}
  1555  
  1556  	// Should still exist
  1557  	ws := memdb.NewWatchSet()
  1558  	out, err := state.JobByID(ws, job.Namespace, job.ID)
  1559  	if err != nil {
  1560  		t.Fatalf("err: %v", err)
  1561  	}
  1562  	if out == nil {
  1563  		t.Fatalf("bad: %v", out)
  1564  	}
  1565  
  1566  	// Mark the job as stopped and try again
  1567  	job2 := job.Copy()
  1568  	job2.Stop = true
  1569  	err = state.UpsertJob(2000, job2)
  1570  	if err != nil {
  1571  		t.Fatalf("err: %v", err)
  1572  	}
  1573  
  1574  	// Create a core scheduler
  1575  	snap, err = state.Snapshot()
  1576  	if err != nil {
  1577  		t.Fatalf("err: %v", err)
  1578  	}
  1579  	core = NewCoreScheduler(s1, snap)
  1580  
  1581  	// Attempt the GC
  1582  	gc = s1.coreJobEval(structs.CoreJobForceGC, 2002)
  1583  	err = core.Process(gc)
  1584  	if err != nil {
  1585  		t.Fatalf("err: %v", err)
  1586  	}
  1587  
  1588  	// Should not exist
  1589  	out, err = state.JobByID(ws, job.Namespace, job.ID)
  1590  	if err != nil {
  1591  		t.Fatalf("err: %v", err)
  1592  	}
  1593  	if out != nil {
  1594  		t.Fatalf("bad: %+v", out)
  1595  	}
  1596  }
  1597  
  1598  func TestCoreScheduler_DeploymentGC(t *testing.T) {
  1599  	t.Parallel()
  1600  	s1 := TestServer(t, nil)
  1601  	defer s1.Shutdown()
  1602  	testutil.WaitForLeader(t, s1.RPC)
  1603  	assert := assert.New(t)
  1604  
  1605  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1606  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1607  
  1608  	// Insert an active, terminal, and terminal with allocations deployment
  1609  	state := s1.fsm.State()
  1610  	d1, d2, d3 := mock.Deployment(), mock.Deployment(), mock.Deployment()
  1611  	d1.Status = structs.DeploymentStatusFailed
  1612  	d3.Status = structs.DeploymentStatusSuccessful
  1613  	assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment")
  1614  	assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment")
  1615  	assert.Nil(state.UpsertDeployment(1002, d3), "UpsertDeployment")
  1616  
  1617  	a := mock.Alloc()
  1618  	a.JobID = d3.JobID
  1619  	a.DeploymentID = d3.ID
  1620  	assert.Nil(state.UpsertAllocs(1003, []*structs.Allocation{a}), "UpsertAllocs")
  1621  
  1622  	// Update the time tables to make this work
  1623  	tt := s1.fsm.TimeTable()
  1624  	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.DeploymentGCThreshold))
  1625  
  1626  	// Create a core scheduler
  1627  	snap, err := state.Snapshot()
  1628  	assert.Nil(err, "Snapshot")
  1629  	core := NewCoreScheduler(s1, snap)
  1630  
  1631  	// Attempt the GC
  1632  	gc := s1.coreJobEval(structs.CoreJobDeploymentGC, 2000)
  1633  	assert.Nil(core.Process(gc), "Process GC")
  1634  
  1635  	// Should be gone
  1636  	ws := memdb.NewWatchSet()
  1637  	out, err := state.DeploymentByID(ws, d1.ID)
  1638  	assert.Nil(err, "DeploymentByID")
  1639  	assert.Nil(out, "Terminal Deployment")
  1640  	out2, err := state.DeploymentByID(ws, d2.ID)
  1641  	assert.Nil(err, "DeploymentByID")
  1642  	assert.NotNil(out2, "Active Deployment")
  1643  	out3, err := state.DeploymentByID(ws, d3.ID)
  1644  	assert.Nil(err, "DeploymentByID")
  1645  	assert.NotNil(out3, "Terminal Deployment With Allocs")
  1646  }
  1647  
  1648  func TestCoreScheduler_DeploymentGC_Force(t *testing.T) {
  1649  	t.Parallel()
  1650  	for _, withAcl := range []bool{false, true} {
  1651  		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
  1652  			var server *Server
  1653  			if withAcl {
  1654  				server, _ = TestACLServer(t, nil)
  1655  			} else {
  1656  				server = TestServer(t, nil)
  1657  			}
  1658  			defer server.Shutdown()
  1659  			testutil.WaitForLeader(t, server.RPC)
  1660  			assert := assert.New(t)
  1661  
  1662  			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1663  			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1664  
  1665  			// Insert terminal and active deployment
  1666  			state := server.fsm.State()
  1667  			d1, d2 := mock.Deployment(), mock.Deployment()
  1668  			d1.Status = structs.DeploymentStatusFailed
  1669  			assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment")
  1670  			assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment")
  1671  
  1672  			// Create a core scheduler
  1673  			snap, err := state.Snapshot()
  1674  			assert.Nil(err, "Snapshot")
  1675  			core := NewCoreScheduler(server, snap)
  1676  
  1677  			// Attempt the GC
  1678  			gc := server.coreJobEval(structs.CoreJobForceGC, 1000)
  1679  			assert.Nil(core.Process(gc), "Process Force GC")
  1680  
  1681  			// Should be gone
  1682  			ws := memdb.NewWatchSet()
  1683  			out, err := state.DeploymentByID(ws, d1.ID)
  1684  			assert.Nil(err, "DeploymentByID")
  1685  			assert.Nil(out, "Terminal Deployment")
  1686  			out2, err := state.DeploymentByID(ws, d2.ID)
  1687  			assert.Nil(err, "DeploymentByID")
  1688  			assert.NotNil(out2, "Active Deployment")
  1689  		})
  1690  	}
  1691  }
  1692  
  1693  func TestCoreScheduler_PartitionEvalReap(t *testing.T) {
  1694  	t.Parallel()
  1695  	s1 := TestServer(t, nil)
  1696  	defer s1.Shutdown()
  1697  	testutil.WaitForLeader(t, s1.RPC)
  1698  
  1699  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1700  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1701  
  1702  	// Create a core scheduler
  1703  	snap, err := s1.fsm.State().Snapshot()
  1704  	if err != nil {
  1705  		t.Fatalf("err: %v", err)
  1706  	}
  1707  	core := NewCoreScheduler(s1, snap)
  1708  
  1709  	// Set the max ids per reap to something lower.
  1710  	maxIdsPerReap = 2
  1711  
  1712  	evals := []string{"a", "b", "c"}
  1713  	allocs := []string{"1", "2", "3"}
  1714  	requests := core.(*CoreScheduler).partitionEvalReap(evals, allocs)
  1715  	if len(requests) != 3 {
  1716  		t.Fatalf("Expected 3 requests got: %v", requests)
  1717  	}
  1718  
  1719  	first := requests[0]
  1720  	if len(first.Allocs) != 2 && len(first.Evals) != 0 {
  1721  		t.Fatalf("Unexpected first request: %v", first)
  1722  	}
  1723  
  1724  	second := requests[1]
  1725  	if len(second.Allocs) != 1 && len(second.Evals) != 1 {
  1726  		t.Fatalf("Unexpected second request: %v", second)
  1727  	}
  1728  
  1729  	third := requests[2]
  1730  	if len(third.Allocs) != 0 && len(third.Evals) != 2 {
  1731  		t.Fatalf("Unexpected third request: %v", third)
  1732  	}
  1733  }
  1734  
  1735  func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) {
  1736  	t.Parallel()
  1737  	s1 := TestServer(t, nil)
  1738  	defer s1.Shutdown()
  1739  	testutil.WaitForLeader(t, s1.RPC)
  1740  
  1741  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1742  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1743  
  1744  	// Create a core scheduler
  1745  	snap, err := s1.fsm.State().Snapshot()
  1746  	if err != nil {
  1747  		t.Fatalf("err: %v", err)
  1748  	}
  1749  	core := NewCoreScheduler(s1, snap)
  1750  
  1751  	// Set the max ids per reap to something lower.
  1752  	maxIdsPerReap = 2
  1753  
  1754  	deployments := []string{"a", "b", "c"}
  1755  	requests := core.(*CoreScheduler).partitionDeploymentReap(deployments)
  1756  	if len(requests) != 2 {
  1757  		t.Fatalf("Expected 2 requests got: %v", requests)
  1758  	}
  1759  
  1760  	first := requests[0]
  1761  	if len(first.Deployments) != 2 {
  1762  		t.Fatalf("Unexpected first request: %v", first)
  1763  	}
  1764  
  1765  	second := requests[1]
  1766  	if len(second.Deployments) != 1 {
  1767  		t.Fatalf("Unexpected second request: %v", second)
  1768  	}
  1769  }
  1770  
  1771  func TestCoreScheduler_PartitionJobReap(t *testing.T) {
  1772  	t.Parallel()
  1773  	require := require.New(t)
  1774  	s1 := TestServer(t, nil)
  1775  	defer s1.Shutdown()
  1776  	testutil.WaitForLeader(t, s1.RPC)
  1777  
  1778  	// Create a core scheduler
  1779  	snap, err := s1.fsm.State().Snapshot()
  1780  	if err != nil {
  1781  		t.Fatalf("err: %v", err)
  1782  	}
  1783  	core := NewCoreScheduler(s1, snap)
  1784  
  1785  	// Set the max ids per reap to something lower.
  1786  	maxIdsPerReap = 2
  1787  
  1788  	jobs := []*structs.Job{mock.Job(), mock.Job(), mock.Job()}
  1789  	requests := core.(*CoreScheduler).partitionJobReap(jobs, "")
  1790  	require.Len(requests, 2)
  1791  
  1792  	first := requests[0]
  1793  	second := requests[1]
  1794  	require.Len(first.Jobs, 2)
  1795  	require.Len(second.Jobs, 1)
  1796  }
  1797  
  1798  // Tests various scenarios when allocations are eligible to be GCed
  1799  func TestAllocation_GCEligible(t *testing.T) {
  1800  	type testCase struct {
  1801  		Desc               string
  1802  		GCTime             time.Time
  1803  		ClientStatus       string
  1804  		DesiredStatus      string
  1805  		JobStatus          string
  1806  		JobStop            bool
  1807  		ModifyIndex        uint64
  1808  		NextAllocID        string
  1809  		ReschedulePolicy   *structs.ReschedulePolicy
  1810  		RescheduleTrackers []*structs.RescheduleEvent
  1811  		ThresholdIndex     uint64
  1812  		ShouldGC           bool
  1813  	}
  1814  
  1815  	fail := time.Now()
  1816  
  1817  	harness := []testCase{
  1818  		{
  1819  			Desc:           "Don't GC when non terminal",
  1820  			ClientStatus:   structs.AllocClientStatusPending,
  1821  			DesiredStatus:  structs.AllocDesiredStatusRun,
  1822  			GCTime:         fail,
  1823  			ModifyIndex:    90,
  1824  			ThresholdIndex: 90,
  1825  			ShouldGC:       false,
  1826  		},
  1827  		{
  1828  			Desc:           "Don't GC when non terminal and job stopped",
  1829  			ClientStatus:   structs.AllocClientStatusPending,
  1830  			DesiredStatus:  structs.AllocDesiredStatusRun,
  1831  			JobStop:        true,
  1832  			GCTime:         fail,
  1833  			ModifyIndex:    90,
  1834  			ThresholdIndex: 90,
  1835  			ShouldGC:       false,
  1836  		},
  1837  		{
  1838  			Desc:           "Don't GC when non terminal and job dead",
  1839  			ClientStatus:   structs.AllocClientStatusPending,
  1840  			DesiredStatus:  structs.AllocDesiredStatusRun,
  1841  			JobStatus:      structs.JobStatusDead,
  1842  			GCTime:         fail,
  1843  			ModifyIndex:    90,
  1844  			ThresholdIndex: 90,
  1845  			ShouldGC:       false,
  1846  		},
  1847  		{
  1848  			Desc:             "GC when terminal but not failed ",
  1849  			ClientStatus:     structs.AllocClientStatusComplete,
  1850  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1851  			GCTime:           fail,
  1852  			ModifyIndex:      90,
  1853  			ThresholdIndex:   90,
  1854  			ReschedulePolicy: nil,
  1855  			ShouldGC:         true,
  1856  		},
  1857  		{
  1858  			Desc:             "Don't GC when threshold not met",
  1859  			ClientStatus:     structs.AllocClientStatusComplete,
  1860  			DesiredStatus:    structs.AllocDesiredStatusStop,
  1861  			GCTime:           fail,
  1862  			ModifyIndex:      100,
  1863  			ThresholdIndex:   90,
  1864  			ReschedulePolicy: nil,
  1865  			ShouldGC:         false,
  1866  		},
  1867  		{
  1868  			Desc:             "GC when no reschedule policy",
  1869  			ClientStatus:     structs.AllocClientStatusFailed,
  1870  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1871  			GCTime:           fail,
  1872  			ReschedulePolicy: nil,
  1873  			ModifyIndex:      90,
  1874  			ThresholdIndex:   90,
  1875  			ShouldGC:         true,
  1876  		},
  1877  		{
  1878  			Desc:             "GC when empty policy",
  1879  			ClientStatus:     structs.AllocClientStatusFailed,
  1880  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1881  			GCTime:           fail,
  1882  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute},
  1883  			ModifyIndex:      90,
  1884  			ThresholdIndex:   90,
  1885  			ShouldGC:         true,
  1886  		},
  1887  		{
  1888  			Desc:             "Don't GC when no previous reschedule attempts",
  1889  			ClientStatus:     structs.AllocClientStatusFailed,
  1890  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1891  			GCTime:           fail,
  1892  			ModifyIndex:      90,
  1893  			ThresholdIndex:   90,
  1894  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute},
  1895  			ShouldGC:         false,
  1896  		},
  1897  		{
  1898  			Desc:             "Don't GC when prev reschedule attempt within interval",
  1899  			ClientStatus:     structs.AllocClientStatusFailed,
  1900  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1901  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute},
  1902  			GCTime:           fail,
  1903  			ModifyIndex:      90,
  1904  			ThresholdIndex:   90,
  1905  			RescheduleTrackers: []*structs.RescheduleEvent{
  1906  				{
  1907  					RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(),
  1908  				},
  1909  			},
  1910  			ShouldGC: false,
  1911  		},
  1912  		{
  1913  			Desc:             "GC with prev reschedule attempt outside interval",
  1914  			ClientStatus:     structs.AllocClientStatusFailed,
  1915  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1916  			GCTime:           fail,
  1917  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
  1918  			RescheduleTrackers: []*structs.RescheduleEvent{
  1919  				{
  1920  					RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(),
  1921  				},
  1922  				{
  1923  					RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(),
  1924  				},
  1925  			},
  1926  			ShouldGC: true,
  1927  		},
  1928  		{
  1929  			Desc:             "GC when next alloc id is set",
  1930  			ClientStatus:     structs.AllocClientStatusFailed,
  1931  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1932  			GCTime:           fail,
  1933  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
  1934  			RescheduleTrackers: []*structs.RescheduleEvent{
  1935  				{
  1936  					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
  1937  				},
  1938  			},
  1939  			NextAllocID: uuid.Generate(),
  1940  			ShouldGC:    true,
  1941  		},
  1942  		{
  1943  			Desc:             "Don't GC when next alloc id is not set and unlimited restarts",
  1944  			ClientStatus:     structs.AllocClientStatusFailed,
  1945  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1946  			GCTime:           fail,
  1947  			ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
  1948  			RescheduleTrackers: []*structs.RescheduleEvent{
  1949  				{
  1950  					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
  1951  				},
  1952  			},
  1953  			ShouldGC: false,
  1954  		},
  1955  		{
  1956  			Desc:             "GC when job is stopped",
  1957  			ClientStatus:     structs.AllocClientStatusFailed,
  1958  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1959  			GCTime:           fail,
  1960  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
  1961  			RescheduleTrackers: []*structs.RescheduleEvent{
  1962  				{
  1963  					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
  1964  				},
  1965  			},
  1966  			JobStop:  true,
  1967  			ShouldGC: true,
  1968  		},
  1969  		{
  1970  			Desc:             "GC when job status is dead",
  1971  			ClientStatus:     structs.AllocClientStatusFailed,
  1972  			DesiredStatus:    structs.AllocDesiredStatusRun,
  1973  			GCTime:           fail,
  1974  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
  1975  			RescheduleTrackers: []*structs.RescheduleEvent{
  1976  				{
  1977  					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
  1978  				},
  1979  			},
  1980  			JobStatus: structs.JobStatusDead,
  1981  			ShouldGC:  true,
  1982  		},
  1983  		{
  1984  			Desc:             "GC when desired status is stop, unlimited reschedule policy, no previous reschedule events",
  1985  			ClientStatus:     structs.AllocClientStatusFailed,
  1986  			DesiredStatus:    structs.AllocDesiredStatusStop,
  1987  			GCTime:           fail,
  1988  			ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
  1989  			ShouldGC:         true,
  1990  		},
  1991  		{
  1992  			Desc:             "GC when desired status is stop, limited reschedule policy, some previous reschedule events",
  1993  			ClientStatus:     structs.AllocClientStatusFailed,
  1994  			DesiredStatus:    structs.AllocDesiredStatusStop,
  1995  			GCTime:           fail,
  1996  			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
  1997  			RescheduleTrackers: []*structs.RescheduleEvent{
  1998  				{
  1999  					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
  2000  				},
  2001  			},
  2002  			ShouldGC: true,
  2003  		},
  2004  	}
  2005  
  2006  	for _, tc := range harness {
  2007  		alloc := &structs.Allocation{}
  2008  		alloc.ModifyIndex = tc.ModifyIndex
  2009  		alloc.DesiredStatus = tc.DesiredStatus
  2010  		alloc.ClientStatus = tc.ClientStatus
  2011  		alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers}
  2012  		alloc.NextAllocation = tc.NextAllocID
  2013  		job := mock.Job()
  2014  		alloc.TaskGroup = job.TaskGroups[0].Name
  2015  		job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
  2016  		if tc.JobStatus != "" {
  2017  			job.Status = tc.JobStatus
  2018  		}
  2019  		job.Stop = tc.JobStop
  2020  
  2021  		t.Run(tc.Desc, func(t *testing.T) {
  2022  			if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC {
  2023  				t.Fatalf("expected %v but got %v", tc.ShouldGC, got)
  2024  			}
  2025  		})
  2026  
  2027  	}
  2028  
  2029  	// Verify nil job
  2030  	require := require.New(t)
  2031  	alloc := mock.Alloc()
  2032  	alloc.ClientStatus = structs.AllocClientStatusComplete
  2033  	require.True(allocGCEligible(alloc, nil, time.Now(), 1000))
  2034  }