github.com/adityamillind98/nomad@v0.11.8/nomad/core_sched_test.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "testing" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" 10 "github.com/hashicorp/nomad/helper/uuid" 11 "github.com/hashicorp/nomad/nomad/mock" 12 "github.com/hashicorp/nomad/nomad/state" 13 "github.com/hashicorp/nomad/nomad/structs" 14 "github.com/hashicorp/nomad/testutil" 15 "github.com/stretchr/testify/assert" 16 "github.com/stretchr/testify/require" 17 ) 18 19 func TestCoreScheduler_EvalGC(t *testing.T) { 20 t.Parallel() 21 22 s1, cleanupS1 := TestServer(t, nil) 23 defer cleanupS1() 24 testutil.WaitForLeader(t, s1.RPC) 25 require := require.New(t) 26 27 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 28 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 29 30 // Insert "dead" eval 31 state := s1.fsm.State() 32 eval := mock.Eval() 33 eval.Status = structs.EvalStatusFailed 34 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 35 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 36 require.Nil(err) 37 38 // Insert mock job with rescheduling disabled 39 job := mock.Job() 40 job.ID = eval.JobID 41 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 42 Attempts: 0, 43 Interval: 0 * time.Second, 44 } 45 err = state.UpsertJob(1001, job) 46 require.Nil(err) 47 48 // Insert "dead" alloc 49 alloc := mock.Alloc() 50 alloc.EvalID = eval.ID 51 alloc.DesiredStatus = structs.AllocDesiredStatusStop 52 alloc.JobID = eval.JobID 53 alloc.TaskGroup = job.TaskGroups[0].Name 54 55 // Insert "lost" alloc 56 alloc2 := mock.Alloc() 57 alloc2.EvalID = eval.ID 58 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 59 alloc2.ClientStatus = structs.AllocClientStatusLost 60 alloc2.JobID = eval.JobID 61 alloc2.TaskGroup = job.TaskGroups[0].Name 62 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) 63 if err != nil { 64 t.Fatalf("err: %v", err) 65 } 66 67 // Update the time tables to make this work 68 tt := s1.fsm.TimeTable() 69 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 70 71 // Create a core scheduler 72 snap, err := state.Snapshot() 73 if err != nil { 74 t.Fatalf("err: %v", err) 75 } 76 core := NewCoreScheduler(s1, snap) 77 78 // Attempt the GC 79 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 80 err = core.Process(gc) 81 if err != nil { 82 t.Fatalf("err: %v", err) 83 } 84 85 // Should be gone 86 ws := memdb.NewWatchSet() 87 out, err := state.EvalByID(ws, eval.ID) 88 if err != nil { 89 t.Fatalf("err: %v", err) 90 } 91 if out != nil { 92 t.Fatalf("bad: %v", out) 93 } 94 95 outA, err := state.AllocByID(ws, alloc.ID) 96 if err != nil { 97 t.Fatalf("err: %v", err) 98 } 99 if outA != nil { 100 t.Fatalf("bad: %v", outA) 101 } 102 103 outA2, err := state.AllocByID(ws, alloc2.ID) 104 if err != nil { 105 t.Fatalf("err: %v", err) 106 } 107 if outA2 != nil { 108 t.Fatalf("bad: %v", outA2) 109 } 110 } 111 112 // Tests GC behavior on allocations being rescheduled 113 func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) { 114 t.Parallel() 115 116 s1, cleanupS1 := TestServer(t, nil) 117 defer cleanupS1() 118 testutil.WaitForLeader(t, s1.RPC) 119 require := require.New(t) 120 121 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 122 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 123 124 // Insert "dead" eval 125 state := s1.fsm.State() 126 eval := mock.Eval() 127 eval.Status = structs.EvalStatusFailed 128 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 129 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 130 require.Nil(err) 131 132 // Insert "pending" eval for same job 133 eval2 := mock.Eval() 134 eval2.JobID = eval.JobID 135 state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID)) 136 err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) 137 require.Nil(err) 138 139 // Insert mock job with default reschedule policy of 2 in 10 minutes 140 job := mock.Job() 141 job.ID = eval.JobID 142 143 err = state.UpsertJob(1001, job) 144 require.Nil(err) 145 146 // Insert failed alloc with an old reschedule attempt, can be GCed 147 alloc := mock.Alloc() 148 alloc.Job = job 149 alloc.EvalID = eval.ID 150 alloc.DesiredStatus = structs.AllocDesiredStatusRun 151 alloc.ClientStatus = structs.AllocClientStatusFailed 152 alloc.JobID = eval.JobID 153 alloc.TaskGroup = job.TaskGroups[0].Name 154 alloc.NextAllocation = uuid.Generate() 155 alloc.RescheduleTracker = &structs.RescheduleTracker{ 156 Events: []*structs.RescheduleEvent{ 157 { 158 RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), 159 PrevNodeID: uuid.Generate(), 160 PrevAllocID: uuid.Generate(), 161 }, 162 }, 163 } 164 165 alloc2 := mock.Alloc() 166 alloc2.Job = job 167 alloc2.EvalID = eval.ID 168 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 169 alloc2.ClientStatus = structs.AllocClientStatusFailed 170 alloc2.JobID = eval.JobID 171 alloc2.TaskGroup = job.TaskGroups[0].Name 172 alloc2.RescheduleTracker = &structs.RescheduleTracker{ 173 Events: []*structs.RescheduleEvent{ 174 { 175 RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), 176 PrevNodeID: uuid.Generate(), 177 PrevAllocID: uuid.Generate(), 178 }, 179 }, 180 } 181 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) 182 require.Nil(err) 183 184 // Update the time tables to make this work 185 tt := s1.fsm.TimeTable() 186 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 187 188 // Create a core scheduler 189 snap, err := state.Snapshot() 190 if err != nil { 191 t.Fatalf("err: %v", err) 192 } 193 core := NewCoreScheduler(s1, snap) 194 195 // Attempt the GC, job has all terminal allocs and one pending eval 196 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 197 err = core.Process(gc) 198 require.Nil(err) 199 200 // Eval should still exist 201 ws := memdb.NewWatchSet() 202 out, err := state.EvalByID(ws, eval.ID) 203 require.Nil(err) 204 require.NotNil(out) 205 require.Equal(eval.ID, out.ID) 206 207 outA, err := state.AllocByID(ws, alloc.ID) 208 require.Nil(err) 209 require.Nil(outA) 210 211 outA2, err := state.AllocByID(ws, alloc2.ID) 212 require.Nil(err) 213 require.Equal(alloc2.ID, outA2.ID) 214 215 } 216 217 // Tests GC behavior on stopped job with reschedulable allocs 218 func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) { 219 t.Parallel() 220 221 s1, cleanupS1 := TestServer(t, nil) 222 defer cleanupS1() 223 testutil.WaitForLeader(t, s1.RPC) 224 require := require.New(t) 225 226 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 227 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 228 229 // Insert "dead" eval 230 state := s1.fsm.State() 231 eval := mock.Eval() 232 eval.Status = structs.EvalStatusFailed 233 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 234 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 235 require.Nil(err) 236 237 // Insert mock stopped job with default reschedule policy of 2 in 10 minutes 238 job := mock.Job() 239 job.ID = eval.JobID 240 job.Stop = true 241 242 err = state.UpsertJob(1001, job) 243 require.Nil(err) 244 245 // Insert failed alloc with a recent reschedule attempt 246 alloc := mock.Alloc() 247 alloc.EvalID = eval.ID 248 alloc.DesiredStatus = structs.AllocDesiredStatusRun 249 alloc.ClientStatus = structs.AllocClientStatusLost 250 alloc.JobID = eval.JobID 251 alloc.TaskGroup = job.TaskGroups[0].Name 252 alloc.RescheduleTracker = &structs.RescheduleTracker{ 253 Events: []*structs.RescheduleEvent{ 254 { 255 RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), 256 PrevNodeID: uuid.Generate(), 257 PrevAllocID: uuid.Generate(), 258 }, 259 }, 260 } 261 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc}) 262 require.Nil(err) 263 264 // Update the time tables to make this work 265 tt := s1.fsm.TimeTable() 266 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 267 268 // Create a core scheduler 269 snap, err := state.Snapshot() 270 if err != nil { 271 t.Fatalf("err: %v", err) 272 } 273 core := NewCoreScheduler(s1, snap) 274 275 // Attempt the GC 276 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 277 err = core.Process(gc) 278 require.Nil(err) 279 280 // Eval should not exist 281 ws := memdb.NewWatchSet() 282 out, err := state.EvalByID(ws, eval.ID) 283 require.Nil(err) 284 require.Nil(out) 285 286 // Alloc should not exist 287 outA, err := state.AllocByID(ws, alloc.ID) 288 require.Nil(err) 289 require.Nil(outA) 290 291 } 292 293 // An EvalGC should never reap a batch job that has not been stopped 294 func TestCoreScheduler_EvalGC_Batch(t *testing.T) { 295 t.Parallel() 296 297 s1, cleanupS1 := TestServer(t, nil) 298 defer cleanupS1() 299 testutil.WaitForLeader(t, s1.RPC) 300 301 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 302 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 303 304 // Insert a "dead" job 305 state := s1.fsm.State() 306 job := mock.Job() 307 job.Type = structs.JobTypeBatch 308 job.Status = structs.JobStatusDead 309 err := state.UpsertJob(1000, job) 310 if err != nil { 311 t.Fatalf("err: %v", err) 312 } 313 314 // Insert "complete" eval 315 eval := mock.Eval() 316 eval.Status = structs.EvalStatusComplete 317 eval.Type = structs.JobTypeBatch 318 eval.JobID = job.ID 319 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 320 if err != nil { 321 t.Fatalf("err: %v", err) 322 } 323 324 // Insert "failed" alloc 325 alloc := mock.Alloc() 326 alloc.Job = job 327 alloc.JobID = job.ID 328 alloc.EvalID = eval.ID 329 alloc.DesiredStatus = structs.AllocDesiredStatusStop 330 331 // Insert "lost" alloc 332 alloc2 := mock.Alloc() 333 alloc2.Job = job 334 alloc2.JobID = job.ID 335 alloc2.EvalID = eval.ID 336 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 337 alloc2.ClientStatus = structs.AllocClientStatusLost 338 339 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 340 if err != nil { 341 t.Fatalf("err: %v", err) 342 } 343 344 // Update the time tables to make this work 345 tt := s1.fsm.TimeTable() 346 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 347 348 // Create a core scheduler 349 snap, err := state.Snapshot() 350 if err != nil { 351 t.Fatalf("err: %v", err) 352 } 353 core := NewCoreScheduler(s1, snap) 354 355 // Attempt the GC 356 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 357 err = core.Process(gc) 358 if err != nil { 359 t.Fatalf("err: %v", err) 360 } 361 362 // Nothing should be gone 363 ws := memdb.NewWatchSet() 364 out, err := state.EvalByID(ws, eval.ID) 365 if err != nil { 366 t.Fatalf("err: %v", err) 367 } 368 if out == nil { 369 t.Fatalf("bad: %v", out) 370 } 371 372 outA, err := state.AllocByID(ws, alloc.ID) 373 if err != nil { 374 t.Fatalf("err: %v", err) 375 } 376 if outA == nil { 377 t.Fatalf("bad: %v", outA) 378 } 379 380 outA2, err := state.AllocByID(ws, alloc2.ID) 381 if err != nil { 382 t.Fatalf("err: %v", err) 383 } 384 if outA2 == nil { 385 t.Fatalf("bad: %v", outA2) 386 } 387 388 outB, err := state.JobByID(ws, job.Namespace, job.ID) 389 if err != nil { 390 t.Fatalf("err: %v", err) 391 } 392 if outB == nil { 393 t.Fatalf("bad: %v", outB) 394 } 395 } 396 397 // An EvalGC should reap allocations from jobs with an older modify index 398 func TestCoreScheduler_EvalGC_Batch_OldVersion(t *testing.T) { 399 t.Parallel() 400 401 s1, cleanupS1 := TestServer(t, nil) 402 defer cleanupS1() 403 testutil.WaitForLeader(t, s1.RPC) 404 405 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 406 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 407 408 // Insert a "dead" job 409 state := s1.fsm.State() 410 job := mock.Job() 411 job.Type = structs.JobTypeBatch 412 job.Status = structs.JobStatusDead 413 err := state.UpsertJob(1000, job) 414 if err != nil { 415 t.Fatalf("err: %v", err) 416 } 417 418 // Insert "complete" eval 419 eval := mock.Eval() 420 eval.Status = structs.EvalStatusComplete 421 eval.Type = structs.JobTypeBatch 422 eval.JobID = job.ID 423 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 424 if err != nil { 425 t.Fatalf("err: %v", err) 426 } 427 428 // Insert "failed" alloc 429 alloc := mock.Alloc() 430 alloc.Job = job 431 alloc.JobID = job.ID 432 alloc.EvalID = eval.ID 433 alloc.DesiredStatus = structs.AllocDesiredStatusStop 434 435 // Insert "lost" alloc 436 alloc2 := mock.Alloc() 437 alloc2.Job = job 438 alloc2.JobID = job.ID 439 alloc2.EvalID = eval.ID 440 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 441 alloc2.ClientStatus = structs.AllocClientStatusLost 442 443 // Insert alloc with older job modifyindex 444 alloc3 := mock.Alloc() 445 job2 := job.Copy() 446 447 alloc3.Job = job2 448 alloc3.JobID = job2.ID 449 alloc3.EvalID = eval.ID 450 job2.CreateIndex = 500 451 alloc3.DesiredStatus = structs.AllocDesiredStatusRun 452 alloc3.ClientStatus = structs.AllocClientStatusLost 453 454 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2, alloc3}) 455 if err != nil { 456 t.Fatalf("err: %v", err) 457 } 458 459 // Update the time tables to make this work 460 tt := s1.fsm.TimeTable() 461 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 462 463 // Create a core scheduler 464 snap, err := state.Snapshot() 465 if err != nil { 466 t.Fatalf("err: %v", err) 467 } 468 core := NewCoreScheduler(s1, snap) 469 470 // Attempt the GC 471 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 472 err = core.Process(gc) 473 if err != nil { 474 t.Fatalf("err: %v", err) 475 } 476 477 // Alloc1 and 2 should be there, and alloc3 should be gone 478 ws := memdb.NewWatchSet() 479 out, err := state.EvalByID(ws, eval.ID) 480 if err != nil { 481 t.Fatalf("err: %v", err) 482 } 483 if out == nil { 484 t.Fatalf("bad: %v", out) 485 } 486 487 outA, err := state.AllocByID(ws, alloc.ID) 488 if err != nil { 489 t.Fatalf("err: %v", err) 490 } 491 if outA == nil { 492 t.Fatalf("bad: %v", outA) 493 } 494 495 outA2, err := state.AllocByID(ws, alloc2.ID) 496 if err != nil { 497 t.Fatalf("err: %v", err) 498 } 499 if outA2 == nil { 500 t.Fatalf("bad: %v", outA2) 501 } 502 503 outA3, err := state.AllocByID(ws, alloc3.ID) 504 if err != nil { 505 t.Fatalf("err: %v", err) 506 } 507 if outA3 != nil { 508 t.Fatalf("expected alloc to be nil:%v", outA2) 509 } 510 511 outB, err := state.JobByID(ws, job.Namespace, job.ID) 512 if err != nil { 513 t.Fatalf("err: %v", err) 514 } 515 if outB == nil { 516 t.Fatalf("bad: %v", outB) 517 } 518 } 519 520 // An EvalGC should reap a batch job that has been stopped 521 func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { 522 t.Parallel() 523 524 s1, cleanupS1 := TestServer(t, nil) 525 defer cleanupS1() 526 testutil.WaitForLeader(t, s1.RPC) 527 528 require := require.New(t) 529 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 530 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 531 532 // Create a "dead" job 533 state := s1.fsm.State() 534 job := mock.Job() 535 job.Type = structs.JobTypeBatch 536 job.Status = structs.JobStatusDead 537 job.Stop = true 538 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 539 Attempts: 0, 540 Interval: 0 * time.Second, 541 } 542 err := state.UpsertJob(1001, job) 543 require.Nil(err) 544 545 // Insert "complete" eval 546 eval := mock.Eval() 547 eval.Status = structs.EvalStatusComplete 548 eval.Type = structs.JobTypeBatch 549 eval.JobID = job.ID 550 err = state.UpsertEvals(1002, []*structs.Evaluation{eval}) 551 require.Nil(err) 552 553 // Insert "failed" alloc 554 alloc := mock.Alloc() 555 alloc.JobID = job.ID 556 alloc.EvalID = eval.ID 557 alloc.TaskGroup = job.TaskGroups[0].Name 558 alloc.DesiredStatus = structs.AllocDesiredStatusStop 559 560 // Insert "lost" alloc 561 alloc2 := mock.Alloc() 562 alloc2.JobID = job.ID 563 alloc2.EvalID = eval.ID 564 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 565 alloc2.ClientStatus = structs.AllocClientStatusLost 566 alloc2.TaskGroup = job.TaskGroups[0].Name 567 568 err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2}) 569 if err != nil { 570 t.Fatalf("err: %v", err) 571 } 572 573 // Update the time tables to make this work 574 tt := s1.fsm.TimeTable() 575 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 576 577 // Create a core scheduler 578 snap, err := state.Snapshot() 579 if err != nil { 580 t.Fatalf("err: %v", err) 581 } 582 core := NewCoreScheduler(s1, snap) 583 584 // Attempt the GC 585 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 586 err = core.Process(gc) 587 if err != nil { 588 t.Fatalf("err: %v", err) 589 } 590 591 // Everything should be gone 592 ws := memdb.NewWatchSet() 593 out, err := state.EvalByID(ws, eval.ID) 594 if err != nil { 595 t.Fatalf("err: %v", err) 596 } 597 if out != nil { 598 t.Fatalf("bad: %v", out) 599 } 600 601 outA, err := state.AllocByID(ws, alloc.ID) 602 if err != nil { 603 t.Fatalf("err: %v", err) 604 } 605 if outA != nil { 606 t.Fatalf("bad: %v", outA) 607 } 608 609 outA2, err := state.AllocByID(ws, alloc2.ID) 610 if err != nil { 611 t.Fatalf("err: %v", err) 612 } 613 if outA2 != nil { 614 t.Fatalf("bad: %v", outA2) 615 } 616 } 617 618 func TestCoreScheduler_EvalGC_Partial(t *testing.T) { 619 t.Parallel() 620 621 s1, cleanupS1 := TestServer(t, nil) 622 defer cleanupS1() 623 testutil.WaitForLeader(t, s1.RPC) 624 require := require.New(t) 625 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 626 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 627 628 // Insert "dead" eval 629 state := s1.fsm.State() 630 eval := mock.Eval() 631 eval.Status = structs.EvalStatusComplete 632 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 633 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 634 if err != nil { 635 t.Fatalf("err: %v", err) 636 } 637 638 // Create mock job with id same as eval 639 job := mock.Job() 640 job.ID = eval.JobID 641 642 // Insert "dead" alloc 643 alloc := mock.Alloc() 644 alloc.JobID = job.ID 645 alloc.EvalID = eval.ID 646 alloc.DesiredStatus = structs.AllocDesiredStatusStop 647 alloc.TaskGroup = job.TaskGroups[0].Name 648 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 649 650 // Insert "lost" alloc 651 alloc2 := mock.Alloc() 652 alloc2.JobID = job.ID 653 alloc2.EvalID = eval.ID 654 alloc2.TaskGroup = job.TaskGroups[0].Name 655 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 656 alloc2.ClientStatus = structs.AllocClientStatusLost 657 658 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 659 if err != nil { 660 t.Fatalf("err: %v", err) 661 } 662 663 // Insert "running" alloc 664 alloc3 := mock.Alloc() 665 alloc3.EvalID = eval.ID 666 alloc3.JobID = job.ID 667 state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID)) 668 err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3}) 669 if err != nil { 670 t.Fatalf("err: %v", err) 671 } 672 673 // Insert mock job with rescheduling disabled 674 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 675 Attempts: 0, 676 Interval: 0 * time.Second, 677 } 678 err = state.UpsertJob(1001, job) 679 require.Nil(err) 680 681 // Update the time tables to make this work 682 tt := s1.fsm.TimeTable() 683 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 684 685 // Create a core scheduler 686 snap, err := state.Snapshot() 687 if err != nil { 688 t.Fatalf("err: %v", err) 689 } 690 core := NewCoreScheduler(s1, snap) 691 692 // Attempt the GC 693 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 694 err = core.Process(gc) 695 if err != nil { 696 t.Fatalf("err: %v", err) 697 } 698 699 // Should not be gone 700 ws := memdb.NewWatchSet() 701 out, err := state.EvalByID(ws, eval.ID) 702 if err != nil { 703 t.Fatalf("err: %v", err) 704 } 705 if out == nil { 706 t.Fatalf("bad: %v", out) 707 } 708 709 outA, err := state.AllocByID(ws, alloc3.ID) 710 if err != nil { 711 t.Fatalf("err: %v", err) 712 } 713 if outA == nil { 714 t.Fatalf("bad: %v", outA) 715 } 716 717 // Should be gone 718 outB, err := state.AllocByID(ws, alloc.ID) 719 if err != nil { 720 t.Fatalf("err: %v", err) 721 } 722 if outB != nil { 723 t.Fatalf("bad: %v", outB) 724 } 725 726 outC, err := state.AllocByID(ws, alloc2.ID) 727 if err != nil { 728 t.Fatalf("err: %v", err) 729 } 730 if outC != nil { 731 t.Fatalf("bad: %v", outC) 732 } 733 } 734 735 func TestCoreScheduler_EvalGC_Force(t *testing.T) { 736 t.Parallel() 737 for _, withAcl := range []bool{false, true} { 738 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 739 require := require.New(t) 740 var server *Server 741 var cleanup func() 742 if withAcl { 743 server, _, cleanup = TestACLServer(t, nil) 744 } else { 745 server, cleanup = TestServer(t, nil) 746 } 747 defer cleanup() 748 testutil.WaitForLeader(t, server.RPC) 749 750 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 751 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 752 753 // Insert "dead" eval 754 state := server.fsm.State() 755 eval := mock.Eval() 756 eval.Status = structs.EvalStatusFailed 757 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 758 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 759 if err != nil { 760 t.Fatalf("err: %v", err) 761 } 762 763 // Insert mock job with rescheduling disabled 764 job := mock.Job() 765 job.ID = eval.JobID 766 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 767 Attempts: 0, 768 Interval: 0 * time.Second, 769 } 770 err = state.UpsertJob(1001, job) 771 require.Nil(err) 772 773 // Insert "dead" alloc 774 alloc := mock.Alloc() 775 alloc.EvalID = eval.ID 776 alloc.DesiredStatus = structs.AllocDesiredStatusStop 777 alloc.TaskGroup = job.TaskGroups[0].Name 778 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 779 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) 780 if err != nil { 781 t.Fatalf("err: %v", err) 782 } 783 784 // Create a core scheduler 785 snap, err := state.Snapshot() 786 if err != nil { 787 t.Fatalf("err: %v", err) 788 } 789 core := NewCoreScheduler(server, snap) 790 791 // Attempt the GC 792 gc := server.coreJobEval(structs.CoreJobForceGC, 1002) 793 err = core.Process(gc) 794 if err != nil { 795 t.Fatalf("err: %v", err) 796 } 797 798 // Should be gone 799 ws := memdb.NewWatchSet() 800 out, err := state.EvalByID(ws, eval.ID) 801 if err != nil { 802 t.Fatalf("err: %v", err) 803 } 804 if out != nil { 805 t.Fatalf("bad: %v", out) 806 } 807 808 outA, err := state.AllocByID(ws, alloc.ID) 809 if err != nil { 810 t.Fatalf("err: %v", err) 811 } 812 if outA != nil { 813 t.Fatalf("bad: %v", outA) 814 } 815 }) 816 } 817 } 818 819 func TestCoreScheduler_NodeGC(t *testing.T) { 820 t.Parallel() 821 for _, withAcl := range []bool{false, true} { 822 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 823 var server *Server 824 var cleanup func() 825 if withAcl { 826 server, _, cleanup = TestACLServer(t, nil) 827 } else { 828 server, cleanup = TestServer(t, nil) 829 } 830 defer cleanup() 831 testutil.WaitForLeader(t, server.RPC) 832 833 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 834 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 835 836 // Insert "dead" node 837 state := server.fsm.State() 838 node := mock.Node() 839 node.Status = structs.NodeStatusDown 840 err := state.UpsertNode(1000, node) 841 if err != nil { 842 t.Fatalf("err: %v", err) 843 } 844 845 // Update the time tables to make this work 846 tt := server.fsm.TimeTable() 847 tt.Witness(2000, time.Now().UTC().Add(-1*server.config.NodeGCThreshold)) 848 849 // Create a core scheduler 850 snap, err := state.Snapshot() 851 if err != nil { 852 t.Fatalf("err: %v", err) 853 } 854 core := NewCoreScheduler(server, snap) 855 856 // Attempt the GC 857 gc := server.coreJobEval(structs.CoreJobNodeGC, 2000) 858 err = core.Process(gc) 859 if err != nil { 860 t.Fatalf("err: %v", err) 861 } 862 863 // Should be gone 864 ws := memdb.NewWatchSet() 865 out, err := state.NodeByID(ws, node.ID) 866 if err != nil { 867 t.Fatalf("err: %v", err) 868 } 869 if out != nil { 870 t.Fatalf("bad: %v", out) 871 } 872 }) 873 } 874 } 875 876 func TestCoreScheduler_NodeGC_TerminalAllocs(t *testing.T) { 877 t.Parallel() 878 879 s1, cleanupS1 := TestServer(t, nil) 880 defer cleanupS1() 881 testutil.WaitForLeader(t, s1.RPC) 882 883 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 884 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 885 886 // Insert "dead" node 887 state := s1.fsm.State() 888 node := mock.Node() 889 node.Status = structs.NodeStatusDown 890 err := state.UpsertNode(1000, node) 891 if err != nil { 892 t.Fatalf("err: %v", err) 893 } 894 895 // Insert a terminal alloc on that node 896 alloc := mock.Alloc() 897 alloc.DesiredStatus = structs.AllocDesiredStatusStop 898 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 899 if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil { 900 t.Fatalf("err: %v", err) 901 } 902 903 // Update the time tables to make this work 904 tt := s1.fsm.TimeTable() 905 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold)) 906 907 // Create a core scheduler 908 snap, err := state.Snapshot() 909 if err != nil { 910 t.Fatalf("err: %v", err) 911 } 912 core := NewCoreScheduler(s1, snap) 913 914 // Attempt the GC 915 gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000) 916 err = core.Process(gc) 917 if err != nil { 918 t.Fatalf("err: %v", err) 919 } 920 921 // Should be gone 922 ws := memdb.NewWatchSet() 923 out, err := state.NodeByID(ws, node.ID) 924 if err != nil { 925 t.Fatalf("err: %v", err) 926 } 927 if out != nil { 928 t.Fatalf("bad: %v", out) 929 } 930 } 931 932 func TestCoreScheduler_NodeGC_RunningAllocs(t *testing.T) { 933 t.Parallel() 934 935 s1, cleanupS1 := TestServer(t, nil) 936 defer cleanupS1() 937 testutil.WaitForLeader(t, s1.RPC) 938 939 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 940 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 941 942 // Insert "dead" node 943 state := s1.fsm.State() 944 node := mock.Node() 945 node.Status = structs.NodeStatusDown 946 err := state.UpsertNode(1000, node) 947 if err != nil { 948 t.Fatalf("err: %v", err) 949 } 950 951 // Insert a running alloc on that node 952 alloc := mock.Alloc() 953 alloc.NodeID = node.ID 954 alloc.DesiredStatus = structs.AllocDesiredStatusRun 955 alloc.ClientStatus = structs.AllocClientStatusRunning 956 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 957 if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil { 958 t.Fatalf("err: %v", err) 959 } 960 961 // Update the time tables to make this work 962 tt := s1.fsm.TimeTable() 963 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold)) 964 965 // Create a core scheduler 966 snap, err := state.Snapshot() 967 if err != nil { 968 t.Fatalf("err: %v", err) 969 } 970 core := NewCoreScheduler(s1, snap) 971 972 // Attempt the GC 973 gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000) 974 err = core.Process(gc) 975 if err != nil { 976 t.Fatalf("err: %v", err) 977 } 978 979 // Should still be here 980 ws := memdb.NewWatchSet() 981 out, err := state.NodeByID(ws, node.ID) 982 if err != nil { 983 t.Fatalf("err: %v", err) 984 } 985 if out == nil { 986 t.Fatalf("bad: %v", out) 987 } 988 } 989 990 func TestCoreScheduler_NodeGC_Force(t *testing.T) { 991 t.Parallel() 992 993 s1, cleanupS1 := TestServer(t, nil) 994 defer cleanupS1() 995 testutil.WaitForLeader(t, s1.RPC) 996 997 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 998 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 999 1000 // Insert "dead" node 1001 state := s1.fsm.State() 1002 node := mock.Node() 1003 node.Status = structs.NodeStatusDown 1004 err := state.UpsertNode(1000, node) 1005 if err != nil { 1006 t.Fatalf("err: %v", err) 1007 } 1008 1009 // Create a core scheduler 1010 snap, err := state.Snapshot() 1011 if err != nil { 1012 t.Fatalf("err: %v", err) 1013 } 1014 core := NewCoreScheduler(s1, snap) 1015 1016 // Attempt the GC 1017 gc := s1.coreJobEval(structs.CoreJobForceGC, 1000) 1018 err = core.Process(gc) 1019 if err != nil { 1020 t.Fatalf("err: %v", err) 1021 } 1022 1023 // Should be gone 1024 ws := memdb.NewWatchSet() 1025 out, err := state.NodeByID(ws, node.ID) 1026 if err != nil { 1027 t.Fatalf("err: %v", err) 1028 } 1029 if out != nil { 1030 t.Fatalf("bad: %v", out) 1031 } 1032 } 1033 1034 func TestCoreScheduler_JobGC_OutstandingEvals(t *testing.T) { 1035 t.Parallel() 1036 1037 s1, cleanupS1 := TestServer(t, nil) 1038 defer cleanupS1() 1039 testutil.WaitForLeader(t, s1.RPC) 1040 1041 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1042 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1043 1044 // Insert job. 1045 state := s1.fsm.State() 1046 job := mock.Job() 1047 job.Type = structs.JobTypeBatch 1048 job.Status = structs.JobStatusDead 1049 err := state.UpsertJob(1000, job) 1050 if err != nil { 1051 t.Fatalf("err: %v", err) 1052 } 1053 1054 // Insert two evals, one terminal and one not 1055 eval := mock.Eval() 1056 eval.JobID = job.ID 1057 eval.Status = structs.EvalStatusComplete 1058 1059 eval2 := mock.Eval() 1060 eval2.JobID = job.ID 1061 eval2.Status = structs.EvalStatusPending 1062 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 1063 if err != nil { 1064 t.Fatalf("err: %v", err) 1065 } 1066 1067 // Update the time tables to make this work 1068 tt := s1.fsm.TimeTable() 1069 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1070 1071 // Create a core scheduler 1072 snap, err := state.Snapshot() 1073 if err != nil { 1074 t.Fatalf("err: %v", err) 1075 } 1076 core := NewCoreScheduler(s1, snap) 1077 1078 // Attempt the GC 1079 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1080 err = core.Process(gc) 1081 if err != nil { 1082 t.Fatalf("err: %v", err) 1083 } 1084 1085 // Should still exist 1086 ws := memdb.NewWatchSet() 1087 out, err := state.JobByID(ws, job.Namespace, job.ID) 1088 if err != nil { 1089 t.Fatalf("err: %v", err) 1090 } 1091 if out == nil { 1092 t.Fatalf("bad: %v", out) 1093 } 1094 1095 outE, err := state.EvalByID(ws, eval.ID) 1096 if err != nil { 1097 t.Fatalf("err: %v", err) 1098 } 1099 if outE == nil { 1100 t.Fatalf("bad: %v", outE) 1101 } 1102 1103 outE2, err := state.EvalByID(ws, eval2.ID) 1104 if err != nil { 1105 t.Fatalf("err: %v", err) 1106 } 1107 if outE2 == nil { 1108 t.Fatalf("bad: %v", outE2) 1109 } 1110 1111 // Update the second eval to be terminal 1112 eval2.Status = structs.EvalStatusComplete 1113 err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) 1114 if err != nil { 1115 t.Fatalf("err: %v", err) 1116 } 1117 1118 // Create a core scheduler 1119 snap, err = state.Snapshot() 1120 if err != nil { 1121 t.Fatalf("err: %v", err) 1122 } 1123 core = NewCoreScheduler(s1, snap) 1124 1125 // Attempt the GC 1126 gc = s1.coreJobEval(structs.CoreJobJobGC, 2000) 1127 err = core.Process(gc) 1128 if err != nil { 1129 t.Fatalf("err: %v", err) 1130 } 1131 1132 // Should not still exist 1133 out, err = state.JobByID(ws, job.Namespace, job.ID) 1134 if err != nil { 1135 t.Fatalf("err: %v", err) 1136 } 1137 if out != nil { 1138 t.Fatalf("bad: %v", out) 1139 } 1140 1141 outE, err = state.EvalByID(ws, eval.ID) 1142 if err != nil { 1143 t.Fatalf("err: %v", err) 1144 } 1145 if outE != nil { 1146 t.Fatalf("bad: %v", outE) 1147 } 1148 1149 outE2, err = state.EvalByID(ws, eval2.ID) 1150 if err != nil { 1151 t.Fatalf("err: %v", err) 1152 } 1153 if outE2 != nil { 1154 t.Fatalf("bad: %v", outE2) 1155 } 1156 } 1157 1158 func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { 1159 t.Parallel() 1160 1161 s1, cleanupS1 := TestServer(t, nil) 1162 defer cleanupS1() 1163 testutil.WaitForLeader(t, s1.RPC) 1164 1165 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1166 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1167 1168 // Insert job. 1169 state := s1.fsm.State() 1170 job := mock.Job() 1171 job.Type = structs.JobTypeBatch 1172 job.Status = structs.JobStatusDead 1173 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 1174 Attempts: 0, 1175 Interval: 0 * time.Second, 1176 } 1177 err := state.UpsertJob(1000, job) 1178 if err != nil { 1179 t.Fatalf("err: %v", err) 1180 } 1181 1182 // Insert an eval 1183 eval := mock.Eval() 1184 eval.JobID = job.ID 1185 eval.Status = structs.EvalStatusComplete 1186 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 1187 if err != nil { 1188 t.Fatalf("err: %v", err) 1189 } 1190 1191 // Insert two allocs, one terminal and one not 1192 alloc := mock.Alloc() 1193 alloc.JobID = job.ID 1194 alloc.EvalID = eval.ID 1195 alloc.DesiredStatus = structs.AllocDesiredStatusRun 1196 alloc.ClientStatus = structs.AllocClientStatusComplete 1197 alloc.TaskGroup = job.TaskGroups[0].Name 1198 1199 alloc2 := mock.Alloc() 1200 alloc2.JobID = job.ID 1201 alloc2.EvalID = eval.ID 1202 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 1203 alloc2.ClientStatus = structs.AllocClientStatusRunning 1204 alloc2.TaskGroup = job.TaskGroups[0].Name 1205 1206 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 1207 if err != nil { 1208 t.Fatalf("err: %v", err) 1209 } 1210 1211 // Update the time tables to make this work 1212 tt := s1.fsm.TimeTable() 1213 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1214 1215 // Create a core scheduler 1216 snap, err := state.Snapshot() 1217 if err != nil { 1218 t.Fatalf("err: %v", err) 1219 } 1220 core := NewCoreScheduler(s1, snap) 1221 1222 // Attempt the GC 1223 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1224 err = core.Process(gc) 1225 if err != nil { 1226 t.Fatalf("err: %v", err) 1227 } 1228 1229 // Should still exist 1230 ws := memdb.NewWatchSet() 1231 out, err := state.JobByID(ws, job.Namespace, job.ID) 1232 if err != nil { 1233 t.Fatalf("err: %v", err) 1234 } 1235 if out == nil { 1236 t.Fatalf("bad: %v", out) 1237 } 1238 1239 outA, err := state.AllocByID(ws, alloc.ID) 1240 if err != nil { 1241 t.Fatalf("err: %v", err) 1242 } 1243 if outA == nil { 1244 t.Fatalf("bad: %v", outA) 1245 } 1246 1247 outA2, err := state.AllocByID(ws, alloc2.ID) 1248 if err != nil { 1249 t.Fatalf("err: %v", err) 1250 } 1251 if outA2 == nil { 1252 t.Fatalf("bad: %v", outA2) 1253 } 1254 1255 // Update the second alloc to be terminal 1256 alloc2.ClientStatus = structs.AllocClientStatusComplete 1257 err = state.UpsertAllocs(1003, []*structs.Allocation{alloc2}) 1258 if err != nil { 1259 t.Fatalf("err: %v", err) 1260 } 1261 1262 // Create a core scheduler 1263 snap, err = state.Snapshot() 1264 if err != nil { 1265 t.Fatalf("err: %v", err) 1266 } 1267 core = NewCoreScheduler(s1, snap) 1268 1269 // Attempt the GC 1270 gc = s1.coreJobEval(structs.CoreJobJobGC, 2000) 1271 err = core.Process(gc) 1272 if err != nil { 1273 t.Fatalf("err: %v", err) 1274 } 1275 1276 // Should not still exist 1277 out, err = state.JobByID(ws, job.Namespace, job.ID) 1278 if err != nil { 1279 t.Fatalf("err: %v", err) 1280 } 1281 if out != nil { 1282 t.Fatalf("bad: %v", out) 1283 } 1284 1285 outA, err = state.AllocByID(ws, alloc.ID) 1286 if err != nil { 1287 t.Fatalf("err: %v", err) 1288 } 1289 if outA != nil { 1290 t.Fatalf("bad: %v", outA) 1291 } 1292 1293 outA2, err = state.AllocByID(ws, alloc2.ID) 1294 if err != nil { 1295 t.Fatalf("err: %v", err) 1296 } 1297 if outA2 != nil { 1298 t.Fatalf("bad: %v", outA2) 1299 } 1300 } 1301 1302 // This test ensures that batch jobs are GC'd in one shot, meaning it all 1303 // allocs/evals and job or nothing 1304 func TestCoreScheduler_JobGC_OneShot(t *testing.T) { 1305 t.Parallel() 1306 1307 s1, cleanupS1 := TestServer(t, nil) 1308 defer cleanupS1() 1309 testutil.WaitForLeader(t, s1.RPC) 1310 1311 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1312 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1313 1314 // Insert job. 1315 state := s1.fsm.State() 1316 job := mock.Job() 1317 job.Type = structs.JobTypeBatch 1318 err := state.UpsertJob(1000, job) 1319 if err != nil { 1320 t.Fatalf("err: %v", err) 1321 } 1322 1323 // Insert two complete evals 1324 eval := mock.Eval() 1325 eval.JobID = job.ID 1326 eval.Status = structs.EvalStatusComplete 1327 1328 eval2 := mock.Eval() 1329 eval2.JobID = job.ID 1330 eval2.Status = structs.EvalStatusComplete 1331 1332 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 1333 if err != nil { 1334 t.Fatalf("err: %v", err) 1335 } 1336 1337 // Insert one complete alloc and one running on distinct evals 1338 alloc := mock.Alloc() 1339 alloc.JobID = job.ID 1340 alloc.EvalID = eval.ID 1341 alloc.DesiredStatus = structs.AllocDesiredStatusStop 1342 1343 alloc2 := mock.Alloc() 1344 alloc2.JobID = job.ID 1345 alloc2.EvalID = eval2.ID 1346 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 1347 1348 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 1349 if err != nil { 1350 t.Fatalf("err: %v", err) 1351 } 1352 1353 // Force the jobs state to dead 1354 job.Status = structs.JobStatusDead 1355 1356 // Update the time tables to make this work 1357 tt := s1.fsm.TimeTable() 1358 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1359 1360 // Create a core scheduler 1361 snap, err := state.Snapshot() 1362 if err != nil { 1363 t.Fatalf("err: %v", err) 1364 } 1365 core := NewCoreScheduler(s1, snap) 1366 1367 // Attempt the GC 1368 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1369 err = core.Process(gc) 1370 if err != nil { 1371 t.Fatalf("err: %v", err) 1372 } 1373 1374 // Should still exist 1375 ws := memdb.NewWatchSet() 1376 out, err := state.JobByID(ws, job.Namespace, job.ID) 1377 if err != nil { 1378 t.Fatalf("err: %v", err) 1379 } 1380 if out == nil { 1381 t.Fatalf("bad: %v", out) 1382 } 1383 1384 outE, err := state.EvalByID(ws, eval.ID) 1385 if err != nil { 1386 t.Fatalf("err: %v", err) 1387 } 1388 if outE == nil { 1389 t.Fatalf("bad: %v", outE) 1390 } 1391 1392 outE2, err := state.EvalByID(ws, eval2.ID) 1393 if err != nil { 1394 t.Fatalf("err: %v", err) 1395 } 1396 if outE2 == nil { 1397 t.Fatalf("bad: %v", outE2) 1398 } 1399 1400 outA, err := state.AllocByID(ws, alloc.ID) 1401 if err != nil { 1402 t.Fatalf("err: %v", err) 1403 } 1404 if outA == nil { 1405 t.Fatalf("bad: %v", outA) 1406 } 1407 outA2, err := state.AllocByID(ws, alloc2.ID) 1408 if err != nil { 1409 t.Fatalf("err: %v", err) 1410 } 1411 if outA2 == nil { 1412 t.Fatalf("bad: %v", outA2) 1413 } 1414 } 1415 1416 // This test ensures that stopped jobs are GCd 1417 func TestCoreScheduler_JobGC_Stopped(t *testing.T) { 1418 t.Parallel() 1419 1420 s1, cleanupS1 := TestServer(t, nil) 1421 defer cleanupS1() 1422 testutil.WaitForLeader(t, s1.RPC) 1423 1424 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1425 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1426 1427 // Insert job. 1428 state := s1.fsm.State() 1429 job := mock.Job() 1430 job.Stop = true 1431 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 1432 Attempts: 0, 1433 Interval: 0 * time.Second, 1434 } 1435 err := state.UpsertJob(1000, job) 1436 if err != nil { 1437 t.Fatalf("err: %v", err) 1438 } 1439 1440 // Insert two complete evals 1441 eval := mock.Eval() 1442 eval.JobID = job.ID 1443 eval.Status = structs.EvalStatusComplete 1444 1445 eval2 := mock.Eval() 1446 eval2.JobID = job.ID 1447 eval2.Status = structs.EvalStatusComplete 1448 1449 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 1450 if err != nil { 1451 t.Fatalf("err: %v", err) 1452 } 1453 1454 // Insert one complete alloc 1455 alloc := mock.Alloc() 1456 alloc.JobID = job.ID 1457 alloc.EvalID = eval.ID 1458 alloc.DesiredStatus = structs.AllocDesiredStatusStop 1459 alloc.TaskGroup = job.TaskGroups[0].Name 1460 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) 1461 if err != nil { 1462 t.Fatalf("err: %v", err) 1463 } 1464 1465 // Update the time tables to make this work 1466 tt := s1.fsm.TimeTable() 1467 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1468 1469 // Create a core scheduler 1470 snap, err := state.Snapshot() 1471 if err != nil { 1472 t.Fatalf("err: %v", err) 1473 } 1474 core := NewCoreScheduler(s1, snap) 1475 1476 // Attempt the GC 1477 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1478 err = core.Process(gc) 1479 if err != nil { 1480 t.Fatalf("err: %v", err) 1481 } 1482 1483 // Shouldn't still exist 1484 ws := memdb.NewWatchSet() 1485 out, err := state.JobByID(ws, job.Namespace, job.ID) 1486 if err != nil { 1487 t.Fatalf("err: %v", err) 1488 } 1489 if out != nil { 1490 t.Fatalf("bad: %v", out) 1491 } 1492 1493 outE, err := state.EvalByID(ws, eval.ID) 1494 if err != nil { 1495 t.Fatalf("err: %v", err) 1496 } 1497 if outE != nil { 1498 t.Fatalf("bad: %v", outE) 1499 } 1500 1501 outE2, err := state.EvalByID(ws, eval2.ID) 1502 if err != nil { 1503 t.Fatalf("err: %v", err) 1504 } 1505 if outE2 != nil { 1506 t.Fatalf("bad: %v", outE2) 1507 } 1508 1509 outA, err := state.AllocByID(ws, alloc.ID) 1510 if err != nil { 1511 t.Fatalf("err: %v", err) 1512 } 1513 if outA != nil { 1514 t.Fatalf("bad: %v", outA) 1515 } 1516 } 1517 1518 func TestCoreScheduler_JobGC_Force(t *testing.T) { 1519 t.Parallel() 1520 for _, withAcl := range []bool{false, true} { 1521 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 1522 var server *Server 1523 var cleanup func() 1524 if withAcl { 1525 server, _, cleanup = TestACLServer(t, nil) 1526 } else { 1527 server, cleanup = TestServer(t, nil) 1528 } 1529 defer cleanup() 1530 testutil.WaitForLeader(t, server.RPC) 1531 1532 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1533 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1534 1535 // Insert job. 1536 state := server.fsm.State() 1537 job := mock.Job() 1538 job.Type = structs.JobTypeBatch 1539 job.Status = structs.JobStatusDead 1540 err := state.UpsertJob(1000, job) 1541 if err != nil { 1542 t.Fatalf("err: %v", err) 1543 } 1544 1545 // Insert a terminal eval 1546 eval := mock.Eval() 1547 eval.JobID = job.ID 1548 eval.Status = structs.EvalStatusComplete 1549 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 1550 if err != nil { 1551 t.Fatalf("err: %v", err) 1552 } 1553 1554 // Create a core scheduler 1555 snap, err := state.Snapshot() 1556 if err != nil { 1557 t.Fatalf("err: %v", err) 1558 } 1559 core := NewCoreScheduler(server, snap) 1560 1561 // Attempt the GC 1562 gc := server.coreJobEval(structs.CoreJobForceGC, 1002) 1563 err = core.Process(gc) 1564 if err != nil { 1565 t.Fatalf("err: %v", err) 1566 } 1567 1568 // Shouldn't still exist 1569 ws := memdb.NewWatchSet() 1570 out, err := state.JobByID(ws, job.Namespace, job.ID) 1571 if err != nil { 1572 t.Fatalf("err: %v", err) 1573 } 1574 if out != nil { 1575 t.Fatalf("bad: %v", out) 1576 } 1577 1578 outE, err := state.EvalByID(ws, eval.ID) 1579 if err != nil { 1580 t.Fatalf("err: %v", err) 1581 } 1582 if outE != nil { 1583 t.Fatalf("bad: %v", outE) 1584 } 1585 }) 1586 } 1587 } 1588 1589 // This test ensures parameterized jobs only get gc'd when stopped 1590 func TestCoreScheduler_JobGC_Parameterized(t *testing.T) { 1591 t.Parallel() 1592 1593 s1, cleanupS1 := TestServer(t, nil) 1594 defer cleanupS1() 1595 testutil.WaitForLeader(t, s1.RPC) 1596 1597 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1598 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1599 1600 // Insert a parameterized job. 1601 state := s1.fsm.State() 1602 job := mock.Job() 1603 job.Type = structs.JobTypeBatch 1604 job.Status = structs.JobStatusRunning 1605 job.ParameterizedJob = &structs.ParameterizedJobConfig{ 1606 Payload: structs.DispatchPayloadRequired, 1607 } 1608 err := state.UpsertJob(1000, job) 1609 if err != nil { 1610 t.Fatalf("err: %v", err) 1611 } 1612 1613 // Create a core scheduler 1614 snap, err := state.Snapshot() 1615 if err != nil { 1616 t.Fatalf("err: %v", err) 1617 } 1618 core := NewCoreScheduler(s1, snap) 1619 1620 // Attempt the GC 1621 gc := s1.coreJobEval(structs.CoreJobForceGC, 1002) 1622 err = core.Process(gc) 1623 if err != nil { 1624 t.Fatalf("err: %v", err) 1625 } 1626 1627 // Should still exist 1628 ws := memdb.NewWatchSet() 1629 out, err := state.JobByID(ws, job.Namespace, job.ID) 1630 if err != nil { 1631 t.Fatalf("err: %v", err) 1632 } 1633 if out == nil { 1634 t.Fatalf("bad: %v", out) 1635 } 1636 1637 // Mark the job as stopped and try again 1638 job2 := job.Copy() 1639 job2.Stop = true 1640 err = state.UpsertJob(2000, job2) 1641 if err != nil { 1642 t.Fatalf("err: %v", err) 1643 } 1644 1645 // Create a core scheduler 1646 snap, err = state.Snapshot() 1647 if err != nil { 1648 t.Fatalf("err: %v", err) 1649 } 1650 core = NewCoreScheduler(s1, snap) 1651 1652 // Attempt the GC 1653 gc = s1.coreJobEval(structs.CoreJobForceGC, 2002) 1654 err = core.Process(gc) 1655 if err != nil { 1656 t.Fatalf("err: %v", err) 1657 } 1658 1659 // Should not exist 1660 out, err = state.JobByID(ws, job.Namespace, job.ID) 1661 if err != nil { 1662 t.Fatalf("err: %v", err) 1663 } 1664 if out != nil { 1665 t.Fatalf("bad: %+v", out) 1666 } 1667 } 1668 1669 // This test ensures periodic jobs don't get GCd until they are stopped 1670 func TestCoreScheduler_JobGC_Periodic(t *testing.T) { 1671 t.Parallel() 1672 1673 s1, cleanupS1 := TestServer(t, nil) 1674 defer cleanupS1() 1675 testutil.WaitForLeader(t, s1.RPC) 1676 1677 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1678 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1679 1680 // Insert a parameterized job. 1681 state := s1.fsm.State() 1682 job := mock.PeriodicJob() 1683 err := state.UpsertJob(1000, job) 1684 if err != nil { 1685 t.Fatalf("err: %v", err) 1686 } 1687 1688 // Create a core scheduler 1689 snap, err := state.Snapshot() 1690 if err != nil { 1691 t.Fatalf("err: %v", err) 1692 } 1693 core := NewCoreScheduler(s1, snap) 1694 1695 // Attempt the GC 1696 gc := s1.coreJobEval(structs.CoreJobForceGC, 1002) 1697 err = core.Process(gc) 1698 if err != nil { 1699 t.Fatalf("err: %v", err) 1700 } 1701 1702 // Should still exist 1703 ws := memdb.NewWatchSet() 1704 out, err := state.JobByID(ws, job.Namespace, job.ID) 1705 if err != nil { 1706 t.Fatalf("err: %v", err) 1707 } 1708 if out == nil { 1709 t.Fatalf("bad: %v", out) 1710 } 1711 1712 // Mark the job as stopped and try again 1713 job2 := job.Copy() 1714 job2.Stop = true 1715 err = state.UpsertJob(2000, job2) 1716 if err != nil { 1717 t.Fatalf("err: %v", err) 1718 } 1719 1720 // Create a core scheduler 1721 snap, err = state.Snapshot() 1722 if err != nil { 1723 t.Fatalf("err: %v", err) 1724 } 1725 core = NewCoreScheduler(s1, snap) 1726 1727 // Attempt the GC 1728 gc = s1.coreJobEval(structs.CoreJobForceGC, 2002) 1729 err = core.Process(gc) 1730 if err != nil { 1731 t.Fatalf("err: %v", err) 1732 } 1733 1734 // Should not exist 1735 out, err = state.JobByID(ws, job.Namespace, job.ID) 1736 if err != nil { 1737 t.Fatalf("err: %v", err) 1738 } 1739 if out != nil { 1740 t.Fatalf("bad: %+v", out) 1741 } 1742 } 1743 1744 func TestCoreScheduler_DeploymentGC(t *testing.T) { 1745 t.Parallel() 1746 1747 s1, cleanupS1 := TestServer(t, nil) 1748 defer cleanupS1() 1749 testutil.WaitForLeader(t, s1.RPC) 1750 assert := assert.New(t) 1751 1752 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1753 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1754 1755 // Insert an active, terminal, and terminal with allocations deployment 1756 state := s1.fsm.State() 1757 d1, d2, d3 := mock.Deployment(), mock.Deployment(), mock.Deployment() 1758 d1.Status = structs.DeploymentStatusFailed 1759 d3.Status = structs.DeploymentStatusSuccessful 1760 assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment") 1761 assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment") 1762 assert.Nil(state.UpsertDeployment(1002, d3), "UpsertDeployment") 1763 1764 a := mock.Alloc() 1765 a.JobID = d3.JobID 1766 a.DeploymentID = d3.ID 1767 assert.Nil(state.UpsertAllocs(1003, []*structs.Allocation{a}), "UpsertAllocs") 1768 1769 // Update the time tables to make this work 1770 tt := s1.fsm.TimeTable() 1771 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.DeploymentGCThreshold)) 1772 1773 // Create a core scheduler 1774 snap, err := state.Snapshot() 1775 assert.Nil(err, "Snapshot") 1776 core := NewCoreScheduler(s1, snap) 1777 1778 // Attempt the GC 1779 gc := s1.coreJobEval(structs.CoreJobDeploymentGC, 2000) 1780 assert.Nil(core.Process(gc), "Process GC") 1781 1782 // Should be gone 1783 ws := memdb.NewWatchSet() 1784 out, err := state.DeploymentByID(ws, d1.ID) 1785 assert.Nil(err, "DeploymentByID") 1786 assert.Nil(out, "Terminal Deployment") 1787 out2, err := state.DeploymentByID(ws, d2.ID) 1788 assert.Nil(err, "DeploymentByID") 1789 assert.NotNil(out2, "Active Deployment") 1790 out3, err := state.DeploymentByID(ws, d3.ID) 1791 assert.Nil(err, "DeploymentByID") 1792 assert.NotNil(out3, "Terminal Deployment With Allocs") 1793 } 1794 1795 func TestCoreScheduler_DeploymentGC_Force(t *testing.T) { 1796 t.Parallel() 1797 for _, withAcl := range []bool{false, true} { 1798 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 1799 var server *Server 1800 var cleanup func() 1801 if withAcl { 1802 server, _, cleanup = TestACLServer(t, nil) 1803 } else { 1804 server, cleanup = TestServer(t, nil) 1805 } 1806 defer cleanup() 1807 testutil.WaitForLeader(t, server.RPC) 1808 assert := assert.New(t) 1809 1810 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1811 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1812 1813 // Insert terminal and active deployment 1814 state := server.fsm.State() 1815 d1, d2 := mock.Deployment(), mock.Deployment() 1816 d1.Status = structs.DeploymentStatusFailed 1817 assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment") 1818 assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment") 1819 1820 // Create a core scheduler 1821 snap, err := state.Snapshot() 1822 assert.Nil(err, "Snapshot") 1823 core := NewCoreScheduler(server, snap) 1824 1825 // Attempt the GC 1826 gc := server.coreJobEval(structs.CoreJobForceGC, 1000) 1827 assert.Nil(core.Process(gc), "Process Force GC") 1828 1829 // Should be gone 1830 ws := memdb.NewWatchSet() 1831 out, err := state.DeploymentByID(ws, d1.ID) 1832 assert.Nil(err, "DeploymentByID") 1833 assert.Nil(out, "Terminal Deployment") 1834 out2, err := state.DeploymentByID(ws, d2.ID) 1835 assert.Nil(err, "DeploymentByID") 1836 assert.NotNil(out2, "Active Deployment") 1837 }) 1838 } 1839 } 1840 1841 func TestCoreScheduler_PartitionEvalReap(t *testing.T) { 1842 t.Parallel() 1843 1844 s1, cleanupS1 := TestServer(t, nil) 1845 defer cleanupS1() 1846 testutil.WaitForLeader(t, s1.RPC) 1847 1848 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1849 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1850 1851 // Create a core scheduler 1852 snap, err := s1.fsm.State().Snapshot() 1853 if err != nil { 1854 t.Fatalf("err: %v", err) 1855 } 1856 core := NewCoreScheduler(s1, snap) 1857 1858 // Set the max ids per reap to something lower. 1859 maxIdsPerReap = 2 1860 1861 evals := []string{"a", "b", "c"} 1862 allocs := []string{"1", "2", "3"} 1863 requests := core.(*CoreScheduler).partitionEvalReap(evals, allocs) 1864 if len(requests) != 3 { 1865 t.Fatalf("Expected 3 requests got: %v", requests) 1866 } 1867 1868 first := requests[0] 1869 if len(first.Allocs) != 2 && len(first.Evals) != 0 { 1870 t.Fatalf("Unexpected first request: %v", first) 1871 } 1872 1873 second := requests[1] 1874 if len(second.Allocs) != 1 && len(second.Evals) != 1 { 1875 t.Fatalf("Unexpected second request: %v", second) 1876 } 1877 1878 third := requests[2] 1879 if len(third.Allocs) != 0 && len(third.Evals) != 2 { 1880 t.Fatalf("Unexpected third request: %v", third) 1881 } 1882 } 1883 1884 func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) { 1885 t.Parallel() 1886 1887 s1, cleanupS1 := TestServer(t, nil) 1888 defer cleanupS1() 1889 testutil.WaitForLeader(t, s1.RPC) 1890 1891 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1892 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1893 1894 // Create a core scheduler 1895 snap, err := s1.fsm.State().Snapshot() 1896 if err != nil { 1897 t.Fatalf("err: %v", err) 1898 } 1899 core := NewCoreScheduler(s1, snap) 1900 1901 // Set the max ids per reap to something lower. 1902 maxIdsPerReap = 2 1903 1904 deployments := []string{"a", "b", "c"} 1905 requests := core.(*CoreScheduler).partitionDeploymentReap(deployments) 1906 if len(requests) != 2 { 1907 t.Fatalf("Expected 2 requests got: %v", requests) 1908 } 1909 1910 first := requests[0] 1911 if len(first.Deployments) != 2 { 1912 t.Fatalf("Unexpected first request: %v", first) 1913 } 1914 1915 second := requests[1] 1916 if len(second.Deployments) != 1 { 1917 t.Fatalf("Unexpected second request: %v", second) 1918 } 1919 } 1920 1921 func TestCoreScheduler_PartitionJobReap(t *testing.T) { 1922 t.Parallel() 1923 require := require.New(t) 1924 1925 s1, cleanupS1 := TestServer(t, nil) 1926 defer cleanupS1() 1927 testutil.WaitForLeader(t, s1.RPC) 1928 1929 // Create a core scheduler 1930 snap, err := s1.fsm.State().Snapshot() 1931 if err != nil { 1932 t.Fatalf("err: %v", err) 1933 } 1934 core := NewCoreScheduler(s1, snap) 1935 1936 // Set the max ids per reap to something lower. 1937 maxIdsPerReap = 2 1938 1939 jobs := []*structs.Job{mock.Job(), mock.Job(), mock.Job()} 1940 requests := core.(*CoreScheduler).partitionJobReap(jobs, "") 1941 require.Len(requests, 2) 1942 1943 first := requests[0] 1944 second := requests[1] 1945 require.Len(first.Jobs, 2) 1946 require.Len(second.Jobs, 1) 1947 } 1948 1949 // Tests various scenarios when allocations are eligible to be GCed 1950 func TestAllocation_GCEligible(t *testing.T) { 1951 type testCase struct { 1952 Desc string 1953 GCTime time.Time 1954 ClientStatus string 1955 DesiredStatus string 1956 JobStatus string 1957 JobStop bool 1958 AllocJobModifyIndex uint64 1959 JobModifyIndex uint64 1960 ModifyIndex uint64 1961 NextAllocID string 1962 ReschedulePolicy *structs.ReschedulePolicy 1963 RescheduleTrackers []*structs.RescheduleEvent 1964 ThresholdIndex uint64 1965 ShouldGC bool 1966 } 1967 1968 fail := time.Now() 1969 1970 harness := []testCase{ 1971 { 1972 Desc: "Don't GC when non terminal", 1973 ClientStatus: structs.AllocClientStatusPending, 1974 DesiredStatus: structs.AllocDesiredStatusRun, 1975 GCTime: fail, 1976 ModifyIndex: 90, 1977 ThresholdIndex: 90, 1978 ShouldGC: false, 1979 }, 1980 { 1981 Desc: "Don't GC when non terminal and job stopped", 1982 ClientStatus: structs.AllocClientStatusPending, 1983 DesiredStatus: structs.AllocDesiredStatusRun, 1984 JobStop: true, 1985 GCTime: fail, 1986 ModifyIndex: 90, 1987 ThresholdIndex: 90, 1988 ShouldGC: false, 1989 }, 1990 { 1991 Desc: "Don't GC when non terminal and job dead", 1992 ClientStatus: structs.AllocClientStatusPending, 1993 DesiredStatus: structs.AllocDesiredStatusRun, 1994 JobStatus: structs.JobStatusDead, 1995 GCTime: fail, 1996 ModifyIndex: 90, 1997 ThresholdIndex: 90, 1998 ShouldGC: false, 1999 }, 2000 { 2001 Desc: "Don't GC when non terminal on client and job dead", 2002 ClientStatus: structs.AllocClientStatusRunning, 2003 DesiredStatus: structs.AllocDesiredStatusStop, 2004 JobStatus: structs.JobStatusDead, 2005 GCTime: fail, 2006 ModifyIndex: 90, 2007 ThresholdIndex: 90, 2008 ShouldGC: false, 2009 }, 2010 { 2011 Desc: "GC when terminal but not failed ", 2012 ClientStatus: structs.AllocClientStatusComplete, 2013 DesiredStatus: structs.AllocDesiredStatusRun, 2014 GCTime: fail, 2015 ModifyIndex: 90, 2016 ThresholdIndex: 90, 2017 ReschedulePolicy: nil, 2018 ShouldGC: true, 2019 }, 2020 { 2021 Desc: "Don't GC when threshold not met", 2022 ClientStatus: structs.AllocClientStatusComplete, 2023 DesiredStatus: structs.AllocDesiredStatusStop, 2024 GCTime: fail, 2025 ModifyIndex: 100, 2026 ThresholdIndex: 90, 2027 ReschedulePolicy: nil, 2028 ShouldGC: false, 2029 }, 2030 { 2031 Desc: "GC when no reschedule policy", 2032 ClientStatus: structs.AllocClientStatusFailed, 2033 DesiredStatus: structs.AllocDesiredStatusRun, 2034 GCTime: fail, 2035 ReschedulePolicy: nil, 2036 ModifyIndex: 90, 2037 ThresholdIndex: 90, 2038 ShouldGC: true, 2039 }, 2040 { 2041 Desc: "GC when empty policy", 2042 ClientStatus: structs.AllocClientStatusFailed, 2043 DesiredStatus: structs.AllocDesiredStatusRun, 2044 GCTime: fail, 2045 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute}, 2046 ModifyIndex: 90, 2047 ThresholdIndex: 90, 2048 ShouldGC: true, 2049 }, 2050 { 2051 Desc: "Don't GC when no previous reschedule attempts", 2052 ClientStatus: structs.AllocClientStatusFailed, 2053 DesiredStatus: structs.AllocDesiredStatusRun, 2054 GCTime: fail, 2055 ModifyIndex: 90, 2056 ThresholdIndex: 90, 2057 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute}, 2058 ShouldGC: false, 2059 }, 2060 { 2061 Desc: "Don't GC when prev reschedule attempt within interval", 2062 ClientStatus: structs.AllocClientStatusFailed, 2063 DesiredStatus: structs.AllocDesiredStatusRun, 2064 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute}, 2065 GCTime: fail, 2066 ModifyIndex: 90, 2067 ThresholdIndex: 90, 2068 RescheduleTrackers: []*structs.RescheduleEvent{ 2069 { 2070 RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), 2071 }, 2072 }, 2073 ShouldGC: false, 2074 }, 2075 { 2076 Desc: "GC with prev reschedule attempt outside interval", 2077 ClientStatus: structs.AllocClientStatusFailed, 2078 DesiredStatus: structs.AllocDesiredStatusRun, 2079 GCTime: fail, 2080 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 2081 RescheduleTrackers: []*structs.RescheduleEvent{ 2082 { 2083 RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), 2084 }, 2085 { 2086 RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), 2087 }, 2088 }, 2089 ShouldGC: true, 2090 }, 2091 { 2092 Desc: "GC when next alloc id is set", 2093 ClientStatus: structs.AllocClientStatusFailed, 2094 DesiredStatus: structs.AllocDesiredStatusRun, 2095 GCTime: fail, 2096 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 2097 RescheduleTrackers: []*structs.RescheduleEvent{ 2098 { 2099 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 2100 }, 2101 }, 2102 NextAllocID: uuid.Generate(), 2103 ShouldGC: true, 2104 }, 2105 { 2106 Desc: "Don't GC when next alloc id is not set and unlimited restarts", 2107 ClientStatus: structs.AllocClientStatusFailed, 2108 DesiredStatus: structs.AllocDesiredStatusRun, 2109 GCTime: fail, 2110 ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"}, 2111 RescheduleTrackers: []*structs.RescheduleEvent{ 2112 { 2113 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 2114 }, 2115 }, 2116 ShouldGC: false, 2117 }, 2118 { 2119 Desc: "GC when job is stopped", 2120 ClientStatus: structs.AllocClientStatusFailed, 2121 DesiredStatus: structs.AllocDesiredStatusRun, 2122 GCTime: fail, 2123 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 2124 RescheduleTrackers: []*structs.RescheduleEvent{ 2125 { 2126 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 2127 }, 2128 }, 2129 JobStop: true, 2130 ShouldGC: true, 2131 }, 2132 { 2133 Desc: "GC when job status is dead", 2134 ClientStatus: structs.AllocClientStatusFailed, 2135 DesiredStatus: structs.AllocDesiredStatusRun, 2136 GCTime: fail, 2137 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 2138 RescheduleTrackers: []*structs.RescheduleEvent{ 2139 { 2140 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 2141 }, 2142 }, 2143 JobStatus: structs.JobStatusDead, 2144 ShouldGC: true, 2145 }, 2146 { 2147 Desc: "GC when desired status is stop, unlimited reschedule policy, no previous reschedule events", 2148 ClientStatus: structs.AllocClientStatusFailed, 2149 DesiredStatus: structs.AllocDesiredStatusStop, 2150 GCTime: fail, 2151 ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"}, 2152 ShouldGC: true, 2153 }, 2154 { 2155 Desc: "GC when desired status is stop, limited reschedule policy, some previous reschedule events", 2156 ClientStatus: structs.AllocClientStatusFailed, 2157 DesiredStatus: structs.AllocDesiredStatusStop, 2158 GCTime: fail, 2159 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 2160 RescheduleTrackers: []*structs.RescheduleEvent{ 2161 { 2162 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 2163 }, 2164 }, 2165 ShouldGC: true, 2166 }, 2167 } 2168 2169 for _, tc := range harness { 2170 alloc := &structs.Allocation{} 2171 alloc.ModifyIndex = tc.ModifyIndex 2172 alloc.DesiredStatus = tc.DesiredStatus 2173 alloc.ClientStatus = tc.ClientStatus 2174 alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers} 2175 alloc.NextAllocation = tc.NextAllocID 2176 job := mock.Job() 2177 alloc.TaskGroup = job.TaskGroups[0].Name 2178 job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy 2179 if tc.JobStatus != "" { 2180 job.Status = tc.JobStatus 2181 } 2182 job.Stop = tc.JobStop 2183 2184 t.Run(tc.Desc, func(t *testing.T) { 2185 if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { 2186 t.Fatalf("expected %v but got %v", tc.ShouldGC, got) 2187 } 2188 }) 2189 2190 } 2191 2192 // Verify nil job 2193 require := require.New(t) 2194 alloc := mock.Alloc() 2195 alloc.ClientStatus = structs.AllocClientStatusComplete 2196 require.True(allocGCEligible(alloc, nil, time.Now(), 1000)) 2197 } 2198 2199 func TestCoreScheduler_CSIPluginGC(t *testing.T) { 2200 t.Parallel() 2201 2202 srv, cleanupSRV := TestServer(t, nil) 2203 defer cleanupSRV() 2204 testutil.WaitForLeader(t, srv.RPC) 2205 require := require.New(t) 2206 2207 srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 2208 2209 deleteNodes := state.CreateTestCSIPlugin(srv.fsm.State(), "foo") 2210 defer deleteNodes() 2211 state := srv.fsm.State() 2212 2213 // Update the time tables to make this work 2214 tt := srv.fsm.TimeTable() 2215 index := uint64(2000) 2216 tt.Witness(index, time.Now().UTC().Add(-1*srv.config.CSIPluginGCThreshold)) 2217 2218 // Create a core scheduler 2219 snap, err := state.Snapshot() 2220 require.NoError(err) 2221 core := NewCoreScheduler(srv, snap) 2222 2223 // Attempt the GC 2224 index++ 2225 gc := srv.coreJobEval(structs.CoreJobCSIPluginGC, index) 2226 require.NoError(core.Process(gc)) 2227 2228 // Should not be gone (plugin in use) 2229 ws := memdb.NewWatchSet() 2230 plug, err := state.CSIPluginByID(ws, "foo") 2231 require.NotNil(plug) 2232 require.NoError(err) 2233 2234 // Empty the plugin 2235 plug.Controllers = map[string]*structs.CSIInfo{} 2236 plug.Nodes = map[string]*structs.CSIInfo{} 2237 2238 index++ 2239 err = state.UpsertCSIPlugin(index, plug) 2240 require.NoError(err) 2241 2242 // Retry 2243 index++ 2244 gc = srv.coreJobEval(structs.CoreJobCSIPluginGC, index) 2245 require.NoError(core.Process(gc)) 2246 2247 // Should be gone 2248 plug, err = state.CSIPluginByID(ws, "foo") 2249 require.Nil(plug) 2250 require.NoError(err) 2251 } 2252 2253 func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) { 2254 t.Parallel() 2255 require := require.New(t) 2256 2257 srv, shutdown := TestServer(t, func(c *Config) { 2258 c.NumSchedulers = 0 // Prevent automatic dequeue 2259 }) 2260 2261 defer shutdown() 2262 testutil.WaitForLeader(t, srv.RPC) 2263 codec := rpcClient(t, srv) 2264 2265 index := uint64(1) 2266 volID := uuid.Generate() 2267 ns := structs.DefaultNamespace 2268 pluginID := "foo" 2269 2270 state := srv.fsm.State() 2271 ws := memdb.NewWatchSet() 2272 2273 index, _ = state.LatestIndex() 2274 2275 // Create client node and plugin 2276 node := mock.Node() 2277 node.Attributes["nomad.version"] = "0.11.0" // needs client RPCs 2278 node.CSINodePlugins = map[string]*structs.CSIInfo{ 2279 pluginID: { 2280 PluginID: pluginID, 2281 Healthy: true, 2282 NodeInfo: &structs.CSINodeInfo{}, 2283 }, 2284 } 2285 index++ 2286 err := state.UpsertNode(index, node) 2287 require.NoError(err) 2288 2289 // Note that for volume writes in this test we need to use the 2290 // RPCs rather than StateStore methods directly so that the GC 2291 // job's RPC call updates a later index. otherwise the 2292 // volumewatcher won't trigger for the final GC 2293 2294 // Register a volume 2295 vols := []*structs.CSIVolume{{ 2296 ID: volID, 2297 Namespace: ns, 2298 PluginID: pluginID, 2299 AccessMode: structs.CSIVolumeAccessModeMultiNodeSingleWriter, 2300 AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem, 2301 Topologies: []*structs.CSITopology{}, 2302 }} 2303 volReq := &structs.CSIVolumeRegisterRequest{Volumes: vols} 2304 volReq.Namespace = ns 2305 volReq.Region = srv.config.Region 2306 2307 err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Register", 2308 volReq, &structs.CSIVolumeRegisterResponse{}) 2309 require.NoError(err) 2310 2311 // Create a job with two allocations that claim the volume. 2312 // We use two allocs here, one of which is not running, so 2313 // that we can assert that the volumewatcher has made one 2314 // complete pass (and removed the 2nd alloc) before running 2315 // the GC. 2316 eval := mock.Eval() 2317 eval.Status = structs.EvalStatusFailed 2318 index++ 2319 state.UpsertJobSummary(index, mock.JobSummary(eval.JobID)) 2320 index++ 2321 err = state.UpsertEvals(index, []*structs.Evaluation{eval}) 2322 require.Nil(err) 2323 2324 job := mock.Job() 2325 job.ID = eval.JobID 2326 job.Status = structs.JobStatusRunning 2327 index++ 2328 err = state.UpsertJob(index, job) 2329 require.NoError(err) 2330 2331 alloc1, alloc2 := mock.Alloc(), mock.Alloc() 2332 alloc1.NodeID = node.ID 2333 alloc1.ClientStatus = structs.AllocClientStatusRunning 2334 alloc1.Job = job 2335 alloc1.JobID = job.ID 2336 alloc1.EvalID = eval.ID 2337 2338 alloc2.NodeID = node.ID 2339 alloc2.ClientStatus = structs.AllocClientStatusComplete 2340 alloc2.Job = job 2341 alloc2.JobID = job.ID 2342 alloc2.EvalID = eval.ID 2343 2344 summary := mock.JobSummary(alloc1.JobID) 2345 index++ 2346 require.NoError(state.UpsertJobSummary(index, summary)) 2347 summary = mock.JobSummary(alloc2.JobID) 2348 index++ 2349 require.NoError(state.UpsertJobSummary(index, summary)) 2350 index++ 2351 require.NoError(state.UpsertAllocs(index, 2352 []*structs.Allocation{alloc1, alloc2})) 2353 2354 // Claim the volume for the alloc 2355 req := &structs.CSIVolumeClaimRequest{ 2356 AllocationID: alloc1.ID, 2357 NodeID: node.ID, 2358 VolumeID: volID, 2359 Claim: structs.CSIVolumeClaimWrite, 2360 } 2361 req.Namespace = ns 2362 req.Region = srv.config.Region 2363 err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim", 2364 req, &structs.CSIVolumeClaimResponse{}) 2365 require.NoError(err) 2366 2367 // ready-to-free claim; once it's gone we know the volumewatcher 2368 // has run once and stopped 2369 req.AllocationID = alloc2.ID 2370 req.Claim = structs.CSIVolumeClaimRelease 2371 req.State = structs.CSIVolumeClaimStateControllerDetached 2372 err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim", 2373 req, &structs.CSIVolumeClaimResponse{}) 2374 require.NoError(err) 2375 2376 // wait for volumewatcher 2377 var vol *structs.CSIVolume 2378 require.Eventually(func() bool { 2379 vol, _ = state.CSIVolumeByID(ws, ns, volID) 2380 return len(vol.ReadAllocs) == 0 && 2381 len(vol.ReadClaims) == 0 && 2382 len(vol.PastClaims) == 0 2383 }, time.Second*1, 10*time.Millisecond, "stale claim was not released") 2384 2385 // Delete allocation and job 2386 index++ 2387 err = state.DeleteJob(index, ns, job.ID) 2388 require.NoError(err) 2389 index++ 2390 err = state.DeleteEval(index, []string{eval.ID}, []string{alloc1.ID, alloc2.ID}) 2391 require.NoError(err) 2392 2393 // Create a core scheduler and attempt the volume claim GC 2394 snap, err := state.Snapshot() 2395 require.NoError(err) 2396 core := NewCoreScheduler(srv, snap) 2397 2398 index++ 2399 gc := srv.coreJobEval(structs.CoreJobForceGC, index) 2400 c := core.(*CoreScheduler) 2401 require.NoError(c.csiVolumeClaimGC(gc)) 2402 2403 // the volumewatcher will hit an error here because there's no 2404 // path to the node. but we can't update the claim to bypass the 2405 // client RPCs without triggering the volumewatcher's normal code 2406 // path. 2407 require.Eventually(func() bool { 2408 vol, _ = state.CSIVolumeByID(ws, ns, volID) 2409 return len(vol.WriteClaims) == 1 && 2410 len(vol.WriteAllocs) == 1 && 2411 len(vol.PastClaims) == 0 2412 }, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly") 2413 2414 req.AllocationID = alloc1.ID 2415 err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Claim", 2416 req, &structs.CSIVolumeClaimResponse{}) 2417 require.NoError(err) 2418 2419 // wait for volumewatcher 2420 require.Eventually(func() bool { 2421 vol, _ = state.CSIVolumeByID(ws, ns, volID) 2422 return len(vol.WriteClaims) == 0 && 2423 len(vol.WriteAllocs) == 0 && 2424 len(vol.PastClaims) == 0 2425 }, time.Second*1, 10*time.Millisecond, "claims were not released") 2426 2427 }