gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/core_sched_test.go

package nomad

import (
	"fmt"
	"testing"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

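// The tests below share a common scaffold: seed the FSM's state store with
// jobs, evals, allocs, nodes, or deployments; use the FSM time table to
// pretend the relevant GC threshold has already elapsed; then run a core
// scheduler job against a state snapshot and assert what was (or was not)
// reaped.
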
func TestCoreScheduler_EvalGC(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)
	require := require.New(t)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" eval
	state := s1.fsm.State()
	eval := mock.Eval()
	eval.Status = structs.EvalStatusFailed
	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
	require.Nil(err)

	// Insert mock job with rescheduling disabled
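	// With zero Attempts and a zero Interval, failed allocs are never
	// rescheduled, so nothing holds their evals or allocs back from GC.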
	job := mock.Job()
	job.ID = eval.JobID
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 0,
		Interval: 0 * time.Second,
	}
	err = state.UpsertJob(1001, job)
	require.Nil(err)

	// Insert "dead" alloc
	alloc := mock.Alloc()
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.JobID = eval.JobID
	alloc.TaskGroup = job.TaskGroups[0].Name

	// Insert "lost" alloc
	alloc2 := mock.Alloc()
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusLost
	alloc2.JobID = eval.JobID
	alloc2.TaskGroup = job.TaskGroups[0].Name
	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
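	// Witness ties raft index 2000 to a timestamp older than EvalGCThreshold,
	// so the GC pass below treats state at or below that index as expired.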
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should be gone
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA != nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 != nil {
		t.Fatalf("bad: %v", outA2)
	}
}

// Tests GC behavior on allocations being rescheduled
func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)
	require := require.New(t)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" eval
	state := s1.fsm.State()
	eval := mock.Eval()
	eval.Status = structs.EvalStatusFailed
	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
	require.Nil(err)

	// Insert "pending" eval for the same job
	eval2 := mock.Eval()
	eval2.JobID = eval.JobID
	state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID))
	err = state.UpsertEvals(1003, []*structs.Evaluation{eval2})
	require.Nil(err)

	// Insert mock job with the default reschedule policy of 2 in 10 minutes
	job := mock.Job()
	job.ID = eval.JobID

	err = state.UpsertJob(1001, job)
	require.Nil(err)

	// Insert failed alloc with an old reschedule attempt that can be GCed
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusRun
	alloc.ClientStatus = structs.AllocClientStatusFailed
	alloc.JobID = eval.JobID
	alloc.TaskGroup = job.TaskGroups[0].Name
	alloc.NextAllocation = uuid.Generate()
	alloc.RescheduleTracker = &structs.RescheduleTracker{
		Events: []*structs.RescheduleEvent{
			{
				RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
				PrevNodeID:     uuid.Generate(),
				PrevAllocID:    uuid.Generate(),
			},
		},
	}

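	// Insert failed alloc with a recent reschedule attempt; it is still inside
	// the job's 10 minute reschedule window, so the GC has to preserve it.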
	alloc2 := mock.Alloc()
	alloc2.Job = job
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusFailed
	alloc2.JobID = eval.JobID
	alloc2.TaskGroup = job.TaskGroups[0].Name
	alloc2.RescheduleTracker = &structs.RescheduleTracker{
		Events: []*structs.RescheduleEvent{
			{
				RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
				PrevNodeID:     uuid.Generate(),
				PrevAllocID:    uuid.Generate(),
			},
		},
	}
	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
	require.Nil(err)

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC; the job has all-terminal allocs and one pending eval
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	require.Nil(err)

	// Eval should still exist
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	require.Nil(err)
	require.NotNil(out)
	require.Equal(eval.ID, out.ID)

	outA, err := state.AllocByID(ws, alloc.ID)
	require.Nil(err)
	require.Nil(outA)

	outA2, err := state.AllocByID(ws, alloc2.ID)
	require.Nil(err)
	require.Equal(alloc2.ID, outA2.ID)
}

// Tests GC behavior on a stopped job with reschedulable allocs
func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)
	require := require.New(t)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" eval
	state := s1.fsm.State()
	eval := mock.Eval()
	eval.Status = structs.EvalStatusFailed
	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
	require.Nil(err)

	// Insert mock stopped job with the default reschedule policy of 2 in 10 minutes
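	// Because the job is stopped, rescheduling no longer applies, so even an
	// alloc with a recent reschedule attempt is eligible for GC.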
	job := mock.Job()
	job.ID = eval.JobID
	job.Stop = true

	err = state.UpsertJob(1001, job)
	require.Nil(err)

	// Insert a lost alloc with a recent reschedule attempt
	alloc := mock.Alloc()
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusRun
	alloc.ClientStatus = structs.AllocClientStatusLost
	alloc.JobID = eval.JobID
	alloc.TaskGroup = job.TaskGroups[0].Name
	alloc.RescheduleTracker = &structs.RescheduleTracker{
		Events: []*structs.RescheduleEvent{
			{
				RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
				PrevNodeID:     uuid.Generate(),
				PrevAllocID:    uuid.Generate(),
			},
		},
	}
	err = state.UpsertAllocs(1001, []*structs.Allocation{alloc})
	require.Nil(err)

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	require.Nil(err)

	// Eval should not exist
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	require.Nil(err)
	require.Nil(out)

	// Alloc should not exist
	outA, err := state.AllocByID(ws, alloc.ID)
	require.Nil(err)
	require.Nil(outA)
}

// An EvalGC should never reap a batch job that has not been stopped
func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert a "dead" job
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert "complete" eval
	eval := mock.Eval()
	eval.Status = structs.EvalStatusComplete
	eval.Type = structs.JobTypeBatch
	eval.JobID = job.ID
	err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert "failed" alloc
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop

	// Insert "lost" alloc
	alloc2 := mock.Alloc()
	alloc2.Job = job
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusLost

	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Nothing should be gone
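	// A batch job that has not been stopped keeps its evals and allocs so its
	// run history stays queryable; they are only reaped once the job itself
	// is stopped or removed.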
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA == nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 == nil {
		t.Fatalf("bad: %v", outA2)
	}

	outB, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outB == nil {
		t.Fatalf("bad: %v", outB)
	}
}

// An EvalGC should reap allocations whose job has an older create index
func TestCoreScheduler_EvalGC_Batch_OldVersion(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert a "dead" job
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert "complete" eval
	eval := mock.Eval()
	eval.Status = structs.EvalStatusComplete
	eval.Type = structs.JobTypeBatch
	eval.JobID = job.ID
	err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert "failed" alloc
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop

	// Insert "lost" alloc
	alloc2 := mock.Alloc()
	alloc2.Job = job
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusLost

	// Insert alloc attached to a copy of the job with an older create index
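	// The GC compares the job attached to each alloc with the live job; a
	// lower CreateIndex marks the alloc as belonging to an earlier incarnation
	// of the job, which makes it reapable while its siblings are kept.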
	alloc3 := mock.Alloc()
	job2 := job.Copy()

	alloc3.Job = job2
	alloc3.JobID = job2.ID
	alloc3.EvalID = eval.ID
	job2.CreateIndex = 500
	alloc3.DesiredStatus = structs.AllocDesiredStatusRun
	alloc3.ClientStatus = structs.AllocClientStatusLost

	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2, alloc3})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Allocs 1 and 2 should remain, and alloc3 should be gone
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA == nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 == nil {
		t.Fatalf("bad: %v", outA2)
	}

	outA3, err := state.AllocByID(ws, alloc3.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA3 != nil {
		t.Fatalf("expected alloc to be nil: %v", outA3)
	}

	outB, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outB == nil {
		t.Fatalf("bad: %v", outB)
	}
}

// An EvalGC should reap a batch job that has been stopped
func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	require := require.New(t)
	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Create a "dead" job
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead
	job.Stop = true
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 0,
		Interval: 0 * time.Second,
	}
	err := state.UpsertJob(1001, job)
	require.Nil(err)

	// Insert "complete" eval
	eval := mock.Eval()
	eval.Status = structs.EvalStatusComplete
	eval.Type = structs.JobTypeBatch
	eval.JobID = job.ID
	err = state.UpsertEvals(1002, []*structs.Evaluation{eval})
	require.Nil(err)

	// Insert "failed" alloc
	alloc := mock.Alloc()
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.TaskGroup = job.TaskGroups[0].Name
	alloc.DesiredStatus = structs.AllocDesiredStatusStop

	// Insert "lost" alloc
	alloc2 := mock.Alloc()
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusLost
	alloc2.TaskGroup = job.TaskGroups[0].Name

	err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Everything should be gone
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA != nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 != nil {
		t.Fatalf("bad: %v", outA2)
	}
}

func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)
	require := require.New(t)
	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" eval
	state := s1.fsm.State()
	eval := mock.Eval()
	eval.Status = structs.EvalStatusComplete
	state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
	err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a mock job whose ID matches the eval's JobID
	job := mock.Job()
	job.ID = eval.JobID

	// Insert "dead" alloc
	alloc := mock.Alloc()
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.TaskGroup = job.TaskGroups[0].Name
	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))

	// Insert "lost" alloc
	alloc2 := mock.Alloc()
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID
	alloc2.TaskGroup = job.TaskGroups[0].Name
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusLost

	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert "running" alloc
	alloc3 := mock.Alloc()
	alloc3.EvalID = eval.ID
	alloc3.JobID = job.ID
	state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID))
	err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert mock job with rescheduling disabled
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 0,
		Interval: 0 * time.Second,
	}
	err = state.UpsertJob(1001, job)
	require.Nil(err)

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not be gone
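	// The eval has to survive because alloc3 is still non-terminal, but the
	// terminal allocs attached to it can be reaped individually.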
	ws := memdb.NewWatchSet()
	out, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc3.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA == nil {
		t.Fatalf("bad: %v", outA)
	}

	// Should be gone
	outB, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outB != nil {
		t.Fatalf("bad: %v", outB)
	}

	outC, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outC != nil {
		t.Fatalf("bad: %v", outC)
	}
}

func TestCoreScheduler_EvalGC_Force(t *testing.T) {
	t.Parallel()
	for _, withAcl := range []bool{false, true} {
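		// Run the same scenario with and without ACLs enabled; the core
		// scheduler garbage collects with server-internal privileges, so the
		// outcome should be identical in both cases.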
		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
			require := require.New(t)
			var server *Server
			var cleanup func()
			if withAcl {
				server, _, cleanup = TestACLServer(t, nil)
			} else {
				server, cleanup = TestServer(t, nil)
			}
			defer cleanup()
			testutil.WaitForLeader(t, server.RPC)

			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

			// Insert "dead" eval
			state := server.fsm.State()
			eval := mock.Eval()
			eval.Status = structs.EvalStatusFailed
			state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
			err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Insert mock job with rescheduling disabled
			job := mock.Job()
			job.ID = eval.JobID
			job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
				Attempts: 0,
				Interval: 0 * time.Second,
			}
			err = state.UpsertJob(1001, job)
			require.Nil(err)

			// Insert "dead" alloc
			alloc := mock.Alloc()
			alloc.EvalID = eval.ID
			alloc.DesiredStatus = structs.AllocDesiredStatusStop
			alloc.TaskGroup = job.TaskGroups[0].Name
			state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
			err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Create a core scheduler
			snap, err := state.Snapshot()
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			core := NewCoreScheduler(server, snap)

			// Attempt the GC
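			// CoreJobForceGC bypasses the GC thresholds, which is why this
			// subtest never touches the time table.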
			gc := server.coreJobEval(structs.CoreJobForceGC, 1002)
			err = core.Process(gc)
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Should be gone
			ws := memdb.NewWatchSet()
			out, err := state.EvalByID(ws, eval.ID)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if out != nil {
				t.Fatalf("bad: %v", out)
			}

			outA, err := state.AllocByID(ws, alloc.ID)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if outA != nil {
				t.Fatalf("bad: %v", outA)
			}
		})
	}
}

func TestCoreScheduler_NodeGC(t *testing.T) {
	t.Parallel()
	for _, withAcl := range []bool{false, true} {
		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
			var server *Server
			var cleanup func()
			if withAcl {
				server, _, cleanup = TestACLServer(t, nil)
			} else {
				server, cleanup = TestServer(t, nil)
			}
			defer cleanup()
			testutil.WaitForLeader(t, server.RPC)

			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

			// Insert "dead" node
			state := server.fsm.State()
			node := mock.Node()
			node.Status = structs.NodeStatusDown
			err := state.UpsertNode(1000, node)
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Update the time tables to make this work
			tt := server.fsm.TimeTable()
			tt.Witness(2000, time.Now().UTC().Add(-1*server.config.NodeGCThreshold))

			// Create a core scheduler
			snap, err := state.Snapshot()
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			core := NewCoreScheduler(server, snap)

			// Attempt the GC
			gc := server.coreJobEval(structs.CoreJobNodeGC, 2000)
			err = core.Process(gc)
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Should be gone
			ws := memdb.NewWatchSet()
			out, err := state.NodeByID(ws, node.ID)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if out != nil {
				t.Fatalf("bad: %v", out)
			}
		})
	}
}

func TestCoreScheduler_NodeGC_TerminalAllocs(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" node
	state := s1.fsm.State()
	node := mock.Node()
	node.Status = structs.NodeStatusDown
	err := state.UpsertNode(1000, node)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert a terminal alloc on that node
	alloc := mock.Alloc()
	alloc.NodeID = node.ID // place the alloc on the node so the GC must consider it
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
	if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should be gone; a terminal alloc does not block node GC
	ws := memdb.NewWatchSet()
	out, err := state.NodeByID(ws, node.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}
}

func TestCoreScheduler_NodeGC_RunningAllocs(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" node
	state := s1.fsm.State()
	node := mock.Node()
	node.Status = structs.NodeStatusDown
	err := state.UpsertNode(1000, node)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert a running alloc on that node
	alloc := mock.Alloc()
	alloc.NodeID = node.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusRun
	alloc.ClientStatus = structs.AllocClientStatusRunning
	state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
	if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still be here; a running alloc blocks node GC
	ws := memdb.NewWatchSet()
	out, err := state.NodeByID(ws, node.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}
}

func TestCoreScheduler_NodeGC_Force(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert "dead" node
	state := s1.fsm.State()
	node := mock.Node()
	node.Status = structs.NodeStatusDown
	err := state.UpsertNode(1000, node)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobForceGC, 1000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should be gone
	ws := memdb.NewWatchSet()
	out, err := state.NodeByID(ws, node.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}
}

func TestCoreScheduler_JobGC_OutstandingEvals(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert job.
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert two evals, one terminal and one not
	eval := mock.Eval()
	eval.JobID = job.ID
	eval.Status = structs.EvalStatusComplete

	eval2 := mock.Eval()
	eval2.JobID = job.ID
	eval2.Status = structs.EvalStatusPending
	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still exist
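	// The pending eval blocks job GC, so the job and both evals are retained.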
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outE, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE == nil {
		t.Fatalf("bad: %v", outE)
	}

	outE2, err := state.EvalByID(ws, eval2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE2 == nil {
		t.Fatalf("bad: %v", outE2)
	}

	// Update the second eval to be terminal
	eval2.Status = structs.EvalStatusComplete
	err = state.UpsertEvals(1003, []*structs.Evaluation{eval2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err = state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core = NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc = s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not still exist
	out, err = state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}

	outE, err = state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE != nil {
		t.Fatalf("bad: %v", outE)
	}

	outE2, err = state.EvalByID(ws, eval2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE2 != nil {
		t.Fatalf("bad: %v", outE2)
	}
}

func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert job.
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 0,
		Interval: 0 * time.Second,
	}
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert an eval
	eval := mock.Eval()
	eval.JobID = job.ID
	eval.Status = structs.EvalStatusComplete
	err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert two allocs, one terminal and one not
	alloc := mock.Alloc()
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusRun
	alloc.ClientStatus = structs.AllocClientStatusComplete
	alloc.TaskGroup = job.TaskGroups[0].Name

	alloc2 := mock.Alloc()
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
	alloc2.ClientStatus = structs.AllocClientStatusRunning
	alloc2.TaskGroup = job.TaskGroups[0].Name

	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still exist; the running alloc blocks job GC
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA == nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 == nil {
		t.Fatalf("bad: %v", outA2)
	}

	// Update the second alloc to be terminal
	alloc2.ClientStatus = structs.AllocClientStatusComplete
	err = state.UpsertAllocs(1003, []*structs.Allocation{alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err = state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core = NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc = s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not still exist
	out, err = state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}

	outA, err = state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA != nil {
		t.Fatalf("bad: %v", outA)
	}

	outA2, err = state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 != nil {
		t.Fatalf("bad: %v", outA2)
	}
}

// This test ensures that batch jobs are GC'd in one shot, meaning the job and
// all of its allocs/evals are removed together or not at all
func TestCoreScheduler_JobGC_OneShot(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert job.
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert two complete evals
	eval := mock.Eval()
	eval.JobID = job.ID
	eval.Status = structs.EvalStatusComplete

	eval2 := mock.Eval()
	eval2.JobID = job.ID
	eval2.Status = structs.EvalStatusComplete

	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert one complete alloc and one running alloc, each on a distinct eval
	alloc := mock.Alloc()
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop

	alloc2 := mock.Alloc()
	alloc2.JobID = job.ID
	alloc2.EvalID = eval2.ID
	alloc2.DesiredStatus = structs.AllocDesiredStatusRun

	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Force the job's status to dead
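	// This relies on the state store holding the same *structs.Job pointer
	// that was upserted, so mutating it here also marks the stored copy dead.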
	job.Status = structs.JobStatusDead

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still exist
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	outE, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE == nil {
		t.Fatalf("bad: %v", outE)
	}

	outE2, err := state.EvalByID(ws, eval2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE2 == nil {
		t.Fatalf("bad: %v", outE2)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA == nil {
		t.Fatalf("bad: %v", outA)
	}
	outA2, err := state.AllocByID(ws, alloc2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA2 == nil {
		t.Fatalf("bad: %v", outA2)
	}
}

// This test ensures that stopped jobs are GC'd
func TestCoreScheduler_JobGC_Stopped(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert a stopped job with rescheduling disabled.
	state := s1.fsm.State()
	job := mock.Job()
	job.Stop = true
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 0,
		Interval: 0 * time.Second,
	}
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert two complete evals
	eval := mock.Eval()
	eval.JobID = job.ID
	eval.Status = structs.EvalStatusComplete

	eval2 := mock.Eval()
	eval2.JobID = job.ID
	eval2.Status = structs.EvalStatusComplete

	err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Insert one complete alloc
	alloc := mock.Alloc()
	alloc.JobID = job.ID
	alloc.EvalID = eval.ID
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.TaskGroup = job.TaskGroups[0].Name
	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobJobGC, 2000)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Shouldn't still exist
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %v", out)
	}

	outE, err := state.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE != nil {
		t.Fatalf("bad: %v", outE)
	}

	outE2, err := state.EvalByID(ws, eval2.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outE2 != nil {
		t.Fatalf("bad: %v", outE2)
	}

	outA, err := state.AllocByID(ws, alloc.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if outA != nil {
		t.Fatalf("bad: %v", outA)
	}
}

func TestCoreScheduler_JobGC_Force(t *testing.T) {
	t.Parallel()
	for _, withAcl := range []bool{false, true} {
		t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
			var server *Server
			var cleanup func()
			if withAcl {
				server, _, cleanup = TestACLServer(t, nil)
			} else {
				server, cleanup = TestServer(t, nil)
			}
			defer cleanup()
			testutil.WaitForLeader(t, server.RPC)

			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

			// Insert job.
			state := server.fsm.State()
			job := mock.Job()
			job.Type = structs.JobTypeBatch
			job.Status = structs.JobStatusDead
			err := state.UpsertJob(1000, job)
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Insert a terminal eval
			eval := mock.Eval()
			eval.JobID = job.ID
			eval.Status = structs.EvalStatusComplete
			err = state.UpsertEvals(1001, []*structs.Evaluation{eval})
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Create a core scheduler
			snap, err := state.Snapshot()
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			core := NewCoreScheduler(server, snap)

			// Attempt the GC
			gc := server.coreJobEval(structs.CoreJobForceGC, 1002)
			err = core.Process(gc)
			if err != nil {
				t.Fatalf("err: %v", err)
			}

			// Shouldn't still exist
			ws := memdb.NewWatchSet()
			out, err := state.JobByID(ws, job.Namespace, job.ID)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if out != nil {
				t.Fatalf("bad: %v", out)
			}

			outE, err := state.EvalByID(ws, eval.ID)
			if err != nil {
				t.Fatalf("err: %v", err)
			}
			if outE != nil {
				t.Fatalf("bad: %v", outE)
			}
		})
	}
}

// This test ensures parameterized jobs only get GC'd when stopped
func TestCoreScheduler_JobGC_Parameterized(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert a parameterized job.
	state := s1.fsm.State()
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusRunning
	job.ParameterizedJob = &structs.ParameterizedJobConfig{
		Payload: structs.DispatchPayloadRequired,
	}
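	// A parameterized job acts as a template for dispatched children, so it
	// must survive even a force GC until it is explicitly stopped.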
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobForceGC, 1002)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still exist
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	// Mark the job as stopped and try again
	job2 := job.Copy()
	job2.Stop = true
	err = state.UpsertJob(2000, job2)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err = state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core = NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc = s1.coreJobEval(structs.CoreJobForceGC, 2002)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not exist
	out, err = state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %+v", out)
	}
}

// This test ensures periodic jobs don't get GC'd until they are stopped
func TestCoreScheduler_JobGC_Periodic(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile, which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Insert a periodic job.
	state := s1.fsm.State()
	job := mock.PeriodicJob()
	err := state.UpsertJob(1000, job)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err := state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobForceGC, 1002)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should still exist
	ws := memdb.NewWatchSet()
	out, err := state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out == nil {
		t.Fatalf("bad: %v", out)
	}

	// Mark the job as stopped and try again
	job2 := job.Copy()
	job2.Stop = true
	err = state.UpsertJob(2000, job2)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a core scheduler
	snap, err = state.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core = NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc = s1.coreJobEval(structs.CoreJobForceGC, 2002)
	err = core.Process(gc)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Should not exist
	out, err = state.JobByID(ws, job.Namespace, job.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if out != nil {
		t.Fatalf("bad: %+v", out)
	}
}

  1744  func TestCoreScheduler_DeploymentGC(t *testing.T) {
  1745  	t.Parallel()
  1746  
  1747  	s1, cleanupS1 := TestServer(t, nil)
  1748  	defer cleanupS1()
  1749  	testutil.WaitForLeader(t, s1.RPC)
  1750  	assert := assert.New(t)
  1751  
  1752  	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
  1753  	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
  1754  
  1755  	// Insert an active, terminal, and terminal with allocations deployment
  1756  	state := s1.fsm.State()
  1757  	d1, d2, d3 := mock.Deployment(), mock.Deployment(), mock.Deployment()
  1758  	d1.Status = structs.DeploymentStatusFailed
  1759  	d3.Status = structs.DeploymentStatusSuccessful
  1760  	assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment")
  1761  	assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment")
  1762  	assert.Nil(state.UpsertDeployment(1002, d3), "UpsertDeployment")
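	// Attach an alloc to d3; a terminal deployment that still has
	// allocs must survive the GC below.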
	a := mock.Alloc()
	a.JobID = d3.JobID
	a.DeploymentID = d3.ID
	assert.Nil(state.UpsertAllocs(1003, []*structs.Allocation{a}), "UpsertAllocs")

	// Update the time tables to make this work
	tt := s1.fsm.TimeTable()
	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.DeploymentGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	assert.Nil(err, "Snapshot")
	core := NewCoreScheduler(s1, snap)

	// Attempt the GC
	gc := s1.coreJobEval(structs.CoreJobDeploymentGC, 2000)
	assert.Nil(core.Process(gc), "Process GC")

	// Should be gone
	ws := memdb.NewWatchSet()
	out, err := state.DeploymentByID(ws, d1.ID)
	assert.Nil(err, "DeploymentByID")
	assert.Nil(out, "Terminal Deployment")
	out2, err := state.DeploymentByID(ws, d2.ID)
	assert.Nil(err, "DeploymentByID")
	assert.NotNil(out2, "Active Deployment")
	out3, err := state.DeploymentByID(ws, d3.ID)
	assert.Nil(err, "DeploymentByID")
	assert.NotNil(out3, "Terminal Deployment With Allocs")
}

func TestCoreScheduler_DeploymentGC_Force(t *testing.T) {
	t.Parallel()
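	// Run the same scenario with and without ACLs enabled; a force GC
	// is expected to behave the same way in both modes.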
	for _, withACL := range []bool{false, true} {
		t.Run(fmt.Sprintf("with acl %v", withACL), func(t *testing.T) {
			var server *Server
			var cleanup func()
			if withACL {
				server, _, cleanup = TestACLServer(t, nil)
			} else {
				server, cleanup = TestServer(t, nil)
			}
			defer cleanup()
			testutil.WaitForLeader(t, server.RPC)
			assert := assert.New(t)

			// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
			server.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

			// Insert a terminal and an active deployment
			state := server.fsm.State()
			d1, d2 := mock.Deployment(), mock.Deployment()
			d1.Status = structs.DeploymentStatusFailed
			assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment")
			assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment")

			// Create a core scheduler
			snap, err := state.Snapshot()
			assert.Nil(err, "Snapshot")
			core := NewCoreScheduler(server, snap)

			// Attempt the GC
			gc := server.coreJobEval(structs.CoreJobForceGC, 1000)
			assert.Nil(core.Process(gc), "Process Force GC")

			// Should be gone
			ws := memdb.NewWatchSet()
			out, err := state.DeploymentByID(ws, d1.ID)
			assert.Nil(err, "DeploymentByID")
			assert.Nil(out, "Terminal Deployment")
			out2, err := state.DeploymentByID(ws, d2.ID)
			assert.Nil(err, "DeploymentByID")
			assert.NotNil(out2, "Active Deployment")
		})
	}
}

func TestCoreScheduler_PartitionEvalReap(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Create a core scheduler
	snap, err := s1.fsm.State().Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Set the max ids per reap to something lower.
	maxIdsPerReap = 2
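	// maxIdsPerReap is a package-level variable shared with the reap
	// code, so a cap of 2 forces partitioning even with only a handful
	// of ids.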

	evals := []string{"a", "b", "c"}
	allocs := []string{"1", "2", "3"}
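	// Allocs are partitioned ahead of evals, so the expected batches
	// are [2 allocs], [1 alloc + 1 eval], and [2 evals].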
	requests := core.(*CoreScheduler).partitionEvalReap(evals, allocs)
	if len(requests) != 3 {
		t.Fatalf("Expected 3 requests got: %v", requests)
	}

	first := requests[0]
	if len(first.Allocs) != 2 || len(first.Evals) != 0 {
		t.Fatalf("Unexpected first request: %v", first)
	}

	second := requests[1]
	if len(second.Allocs) != 1 || len(second.Evals) != 1 {
		t.Fatalf("Unexpected second request: %v", second)
	}

	third := requests[2]
	if len(third.Allocs) != 0 || len(third.Evals) != 2 {
		t.Fatalf("Unexpected third request: %v", third)
	}
}

func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	// Create a core scheduler
	snap, err := s1.fsm.State().Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Set the max ids per reap to something lower.
	maxIdsPerReap = 2
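	// Three deployment ids with a cap of 2 should split into batches
	// of 2 and 1.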

	deployments := []string{"a", "b", "c"}
	requests := core.(*CoreScheduler).partitionDeploymentReap(deployments)
	if len(requests) != 2 {
		t.Fatalf("Expected 2 requests got: %v", requests)
	}

	first := requests[0]
	if len(first.Deployments) != 2 {
		t.Fatalf("Unexpected first request: %v", first)
	}

	second := requests[1]
	if len(second.Deployments) != 1 {
		t.Fatalf("Unexpected second request: %v", second)
	}
}

func TestCoreScheduler_PartitionJobReap(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Create a core scheduler
	snap, err := s1.fsm.State().Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(s1, snap)

	// Set the max ids per reap to something lower.
	maxIdsPerReap = 2
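	// Likewise, three jobs with a cap of 2 should yield batches of 2
	// and 1.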

	jobs := []*structs.Job{mock.Job(), mock.Job(), mock.Job()}
	requests := core.(*CoreScheduler).partitionJobReap(jobs, "")
	require.Len(requests, 2)

	first := requests[0]
	second := requests[1]
	require.Len(first.Jobs, 2)
	require.Len(second.Jobs, 1)
}

// Tests the various scenarios in which allocations become eligible to be GCed
func TestAllocation_GCEligible(t *testing.T) {
	type testCase struct {
		Desc                string
		GCTime              time.Time
		ClientStatus        string
		DesiredStatus       string
		JobStatus           string
		JobStop             bool
		AllocJobModifyIndex uint64
		JobModifyIndex      uint64
		ModifyIndex         uint64
		NextAllocID         string
		ReschedulePolicy    *structs.ReschedulePolicy
		RescheduleTrackers  []*structs.RescheduleEvent
		ThresholdIndex      uint64
		ShouldGC            bool
	}

	fail := time.Now()
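	// fail is the reference time: it is passed to allocGCEligible as
	// the GC time, and the reschedule events below are timestamped
	// relative to it.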

	harness := []testCase{
		{
			Desc:           "Don't GC when non terminal",
			ClientStatus:   structs.AllocClientStatusPending,
			DesiredStatus:  structs.AllocDesiredStatusRun,
			GCTime:         fail,
			ModifyIndex:    90,
			ThresholdIndex: 90,
			ShouldGC:       false,
		},
		{
			Desc:           "Don't GC when non terminal and job stopped",
			ClientStatus:   structs.AllocClientStatusPending,
			DesiredStatus:  structs.AllocDesiredStatusRun,
			JobStop:        true,
			GCTime:         fail,
			ModifyIndex:    90,
			ThresholdIndex: 90,
			ShouldGC:       false,
		},
		{
			Desc:           "Don't GC when non terminal and job dead",
			ClientStatus:   structs.AllocClientStatusPending,
			DesiredStatus:  structs.AllocDesiredStatusRun,
			JobStatus:      structs.JobStatusDead,
			GCTime:         fail,
			ModifyIndex:    90,
			ThresholdIndex: 90,
			ShouldGC:       false,
		},
		{
			Desc:           "Don't GC when non terminal on client and job dead",
			ClientStatus:   structs.AllocClientStatusRunning,
			DesiredStatus:  structs.AllocDesiredStatusStop,
			JobStatus:      structs.JobStatusDead,
			GCTime:         fail,
			ModifyIndex:    90,
			ThresholdIndex: 90,
			ShouldGC:       false,
		},
		{
			Desc:             "GC when terminal but not failed",
			ClientStatus:     structs.AllocClientStatusComplete,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ModifyIndex:      90,
			ThresholdIndex:   90,
			ReschedulePolicy: nil,
			ShouldGC:         true,
		},
		{
			Desc:             "Don't GC when threshold not met",
			ClientStatus:     structs.AllocClientStatusComplete,
			DesiredStatus:    structs.AllocDesiredStatusStop,
			GCTime:           fail,
			ModifyIndex:      100,
			ThresholdIndex:   90,
			ReschedulePolicy: nil,
			ShouldGC:         false,
		},
		{
			Desc:             "GC when no reschedule policy",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: nil,
			ModifyIndex:      90,
			ThresholdIndex:   90,
			ShouldGC:         true,
		},
		{
			Desc:             "GC when empty policy",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute},
			ModifyIndex:      90,
			ThresholdIndex:   90,
			ShouldGC:         true,
		},
		{
			Desc:             "Don't GC when no previous reschedule attempts",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ModifyIndex:      90,
			ThresholdIndex:   90,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute},
			ShouldGC:         false,
		},
		{
			Desc:             "Don't GC when prev reschedule attempt within interval",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute},
			GCTime:           fail,
			ModifyIndex:      90,
			ThresholdIndex:   90,
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(),
				},
			},
			ShouldGC: false,
		},
		{
			Desc:             "GC with prev reschedule attempt outside interval",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(),
				},
				{
					RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(),
				},
			},
			ShouldGC: true,
		},
		{
			Desc:             "GC when next alloc id is set",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
				},
			},
			NextAllocID: uuid.Generate(),
			ShouldGC:    true,
		},
		{
			Desc:             "Don't GC when next alloc id is not set and unlimited restarts",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
				},
			},
			ShouldGC: false,
		},
		{
			Desc:             "GC when job is stopped",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
				},
			},
			JobStop:  true,
			ShouldGC: true,
		},
		{
			Desc:             "GC when job status is dead",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusRun,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
				},
			},
			JobStatus: structs.JobStatusDead,
			ShouldGC:  true,
		},
		{
			Desc:             "GC when desired status is stop, unlimited reschedule policy, no previous reschedule events",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusStop,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
			ShouldGC:         true,
		},
		{
			Desc:             "GC when desired status is stop, limited reschedule policy, some previous reschedule events",
			ClientStatus:     structs.AllocClientStatusFailed,
			DesiredStatus:    structs.AllocDesiredStatusStop,
			GCTime:           fail,
			ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute},
			RescheduleTrackers: []*structs.RescheduleEvent{
				{
					RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
				},
			},
			ShouldGC: true,
		},
	}
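	// Build a minimal alloc/job pair from each case and check its
	// eligibility against allocGCEligible.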
	for _, tc := range harness {
		alloc := &structs.Allocation{}
		alloc.ModifyIndex = tc.ModifyIndex
		alloc.DesiredStatus = tc.DesiredStatus
		alloc.ClientStatus = tc.ClientStatus
		alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers}
		alloc.NextAllocation = tc.NextAllocID
		job := mock.Job()
		alloc.TaskGroup = job.TaskGroups[0].Name
		job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
		if tc.JobStatus != "" {
			job.Status = tc.JobStatus
		}
		job.Stop = tc.JobStop

		t.Run(tc.Desc, func(t *testing.T) {
			if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC {
				t.Fatalf("expected %v but got %v", tc.ShouldGC, got)
			}
		})
	}

	// Verify that an alloc whose job no longer exists is GC eligible
	require := require.New(t)
	alloc := mock.Alloc()
	alloc.ClientStatus = structs.AllocClientStatusComplete
	require.True(allocGCEligible(alloc, nil, time.Now(), 1000))
}

func TestCoreScheduler_CSIPluginGC(t *testing.T) {
	t.Parallel()

	srv, cleanupSRV := TestServer(t, nil)
	defer cleanupSRV()
	testutil.WaitForLeader(t, srv.RPC)
	require := require.New(t)

	srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	deleteNodes := state.CreateTestCSIPlugin(srv.fsm.State(), "foo")
	defer deleteNodes()
	state := srv.fsm.State()
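	// Note that the state identifier now shadows the state package
	// used on the line above.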

	// Update the time tables to make this work
	tt := srv.fsm.TimeTable()
	index := uint64(2000)
	tt.Witness(index, time.Now().UTC().Add(-1*srv.config.CSIPluginGCThreshold))

	// Create a core scheduler
	snap, err := state.Snapshot()
	require.NoError(err)
	core := NewCoreScheduler(srv, snap)

	// Attempt the GC
	index++
	gc := srv.coreJobEval(structs.CoreJobCSIPluginGC, index)
	require.NoError(core.Process(gc))

	// Should not be gone (plugin in use)
	ws := memdb.NewWatchSet()
	plug, err := state.CSIPluginByID(ws, "foo")
	require.NotNil(plug)
	require.NoError(err)

	// Empty the plugin so it no longer counts as in use
	plug.Controllers = map[string]*structs.CSIInfo{}
	plug.Nodes = map[string]*structs.CSIInfo{}

	index++
	err = state.UpsertCSIPlugin(index, plug)
	require.NoError(err)

	// Retry
	index++
	gc = srv.coreJobEval(structs.CoreJobCSIPluginGC, index)
	require.NoError(core.Process(gc))

	// Should be gone
	plug, err = state.CSIPluginByID(ws, "foo")
	require.Nil(plug)
	require.NoError(err)
}

func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	srv, shutdown := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0 // Prevent automatic dequeue
	})
	defer shutdown()
	testutil.WaitForLeader(t, srv.RPC)
	codec := rpcClient(t, srv)

	volID := uuid.Generate()
	ns := structs.DefaultNamespace
	pluginID := "foo"

	state := srv.fsm.State()
	ws := memdb.NewWatchSet()

	index, _ := state.LatestIndex()

	// Create client node and plugin
	node := mock.Node()
	node.Attributes["nomad.version"] = "0.11.0" // needs client RPCs
	node.CSINodePlugins = map[string]*structs.CSIInfo{
		pluginID: {
			PluginID: pluginID,
			Healthy:  true,
			NodeInfo: &structs.CSINodeInfo{},
		},
	}
	index++
	err := state.UpsertNode(index, node)
	require.NoError(err)

	// Note that for volume writes in this test we need to use the
	// RPCs rather than StateStore methods directly, so that the GC
	// job's RPC call updates a later index; otherwise the
	// volumewatcher won't trigger for the final GC.

	// Register a volume
	vols := []*structs.CSIVolume{{
		ID:             volID,
		Namespace:      ns,
		PluginID:       pluginID,
		AccessMode:     structs.CSIVolumeAccessModeMultiNodeSingleWriter,
		AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
		Topologies:     []*structs.CSITopology{},
	}}
	volReq := &structs.CSIVolumeRegisterRequest{Volumes: vols}
	volReq.Namespace = ns
	volReq.Region = srv.config.Region

	err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Register",
		volReq, &structs.CSIVolumeRegisterResponse{})
	require.NoError(err)

	// Create a job with two allocations that claim the volume.
	// We use two allocs here, one of which is not running, so
	// that we can assert that the volumewatcher has made one
	// complete pass (and removed the 2nd alloc) before running
	// the GC.
	eval := mock.Eval()
	eval.Status = structs.EvalStatusFailed
	index++
	state.UpsertJobSummary(index, mock.JobSummary(eval.JobID))
	index++
	err = state.UpsertEvals(index, []*structs.Evaluation{eval})
	require.NoError(err)

	job := mock.Job()
	job.ID = eval.JobID
	job.Status = structs.JobStatusRunning
	index++
	err = state.UpsertJob(index, job)
	require.NoError(err)

	alloc1, alloc2 := mock.Alloc(), mock.Alloc()
	alloc1.NodeID = node.ID
	alloc1.ClientStatus = structs.AllocClientStatusRunning
	alloc1.Job = job
	alloc1.JobID = job.ID
	alloc1.EvalID = eval.ID

	alloc2.NodeID = node.ID
	alloc2.ClientStatus = structs.AllocClientStatusComplete
	alloc2.Job = job
	alloc2.JobID = job.ID
	alloc2.EvalID = eval.ID

	summary := mock.JobSummary(alloc1.JobID)
	index++
	require.NoError(state.UpsertJobSummary(index, summary))
	summary = mock.JobSummary(alloc2.JobID)
	index++
	require.NoError(state.UpsertJobSummary(index, summary))
	index++
	require.NoError(state.UpsertAllocs(index,
		[]*structs.Allocation{alloc1, alloc2}))

	// Claim the volume for the alloc
	req := &structs.CSIVolumeClaimRequest{
		AllocationID: alloc1.ID,
		NodeID:       node.ID,
		VolumeID:     volID,
		Claim:        structs.CSIVolumeClaimWrite,
	}
	req.Namespace = ns
	req.Region = srv.config.Region
	err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim",
		req, &structs.CSIVolumeClaimResponse{})
	require.NoError(err)

	// Ready-to-free claim; once it's gone we know the volumewatcher
	// has run once and stopped.
	req.AllocationID = alloc2.ID
	req.Claim = structs.CSIVolumeClaimRelease
	req.State = structs.CSIVolumeClaimStateControllerDetached
	err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim",
		req, &structs.CSIVolumeClaimResponse{})
	require.NoError(err)

	// Wait for the volumewatcher
	var vol *structs.CSIVolume
	require.Eventually(func() bool {
		vol, _ = state.CSIVolumeByID(ws, ns, volID)
		return vol != nil &&
			len(vol.ReadAllocs) == 0 &&
			len(vol.ReadClaims) == 0 &&
			len(vol.PastClaims) == 0
	}, time.Second*1, 10*time.Millisecond, "stale claim was not released")

	// Delete allocation and job
	index++
	err = state.DeleteJob(index, ns, job.ID)
	require.NoError(err)
	index++
	err = state.DeleteEval(index, []string{eval.ID}, []string{alloc1.ID, alloc2.ID})
	require.NoError(err)

	// Create a core scheduler and attempt the volume claim GC
	snap, err := state.Snapshot()
	require.NoError(err)
	core := NewCoreScheduler(srv, snap)

	index++
	gc := srv.coreJobEval(structs.CoreJobForceGC, index)
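	// Call csiVolumeClaimGC directly rather than core.Process, which
	// for a force GC would run every GC routine, not just the volume
	// claim GC.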
	c := core.(*CoreScheduler)
	require.NoError(c.csiVolumeClaimGC(gc))

	// The volumewatcher will hit an error here because there's no
	// path to the node, but we can't update the claim to bypass the
	// client RPCs without triggering the volumewatcher's normal code
	// path.
	require.Eventually(func() bool {
		vol, _ = state.CSIVolumeByID(ws, ns, volID)
		return vol != nil &&
			len(vol.WriteClaims) == 1 &&
			len(vol.WriteAllocs) == 1 &&
			len(vol.PastClaims) == 0
	}, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly")
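	// Release alloc1's remaining write claim; the volumewatcher should
	// then be able to free the volume entirely.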
	req.AllocationID = alloc1.ID
	err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim",
		req, &structs.CSIVolumeClaimResponse{})
	require.NoError(err)

	// Wait for the volumewatcher
	require.Eventually(func() bool {
		vol, _ = state.CSIVolumeByID(ws, ns, volID)
		return vol != nil &&
			len(vol.WriteClaims) == 0 &&
			len(vol.WriteAllocs) == 0 &&
			len(vol.PastClaims) == 0
	}, time.Second*1, 10*time.Millisecond, "claims were not released")
}