github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/nomad/core_sched_test.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "testing" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/helper/uuid" 10 "github.com/hashicorp/nomad/nomad/mock" 11 "github.com/hashicorp/nomad/nomad/structs" 12 "github.com/hashicorp/nomad/testutil" 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/require" 15 ) 16 17 func TestCoreScheduler_EvalGC(t *testing.T) { 18 t.Parallel() 19 s1 := TestServer(t, nil) 20 defer s1.Shutdown() 21 testutil.WaitForLeader(t, s1.RPC) 22 require := require.New(t) 23 24 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 25 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 26 27 // Insert "dead" eval 28 state := s1.fsm.State() 29 eval := mock.Eval() 30 eval.Status = structs.EvalStatusFailed 31 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 32 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 33 require.Nil(err) 34 35 // Insert mock job with rescheduling disabled 36 job := mock.Job() 37 job.ID = eval.JobID 38 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 39 Attempts: 0, 40 Interval: 0 * time.Second, 41 } 42 err = state.UpsertJob(1001, job) 43 require.Nil(err) 44 45 // Insert "dead" alloc 46 alloc := mock.Alloc() 47 alloc.EvalID = eval.ID 48 alloc.DesiredStatus = structs.AllocDesiredStatusStop 49 alloc.JobID = eval.JobID 50 alloc.TaskGroup = job.TaskGroups[0].Name 51 52 // Insert "lost" alloc 53 alloc2 := mock.Alloc() 54 alloc2.EvalID = eval.ID 55 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 56 alloc2.ClientStatus = structs.AllocClientStatusLost 57 alloc2.JobID = eval.JobID 58 alloc2.TaskGroup = job.TaskGroups[0].Name 59 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) 60 if err != nil { 61 t.Fatalf("err: %v", err) 62 } 63 64 // Update the time tables to make this work 65 tt := s1.fsm.TimeTable() 66 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 67 68 // Create a core scheduler 69 snap, err := state.Snapshot() 70 if err != nil { 71 t.Fatalf("err: %v", err) 72 } 73 core := NewCoreScheduler(s1, snap) 74 75 // Attempt the GC 76 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 77 err = core.Process(gc) 78 if err != nil { 79 t.Fatalf("err: %v", err) 80 } 81 82 // Should be gone 83 ws := memdb.NewWatchSet() 84 out, err := state.EvalByID(ws, eval.ID) 85 if err != nil { 86 t.Fatalf("err: %v", err) 87 } 88 if out != nil { 89 t.Fatalf("bad: %v", out) 90 } 91 92 outA, err := state.AllocByID(ws, alloc.ID) 93 if err != nil { 94 t.Fatalf("err: %v", err) 95 } 96 if outA != nil { 97 t.Fatalf("bad: %v", outA) 98 } 99 100 outA2, err := state.AllocByID(ws, alloc2.ID) 101 if err != nil { 102 t.Fatalf("err: %v", err) 103 } 104 if outA2 != nil { 105 t.Fatalf("bad: %v", outA2) 106 } 107 } 108 109 // Tests GC behavior on allocations being rescheduled 110 func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { 111 t.Parallel() 112 s1 := TestServer(t, nil) 113 defer s1.Shutdown() 114 testutil.WaitForLeader(t, s1.RPC) 115 require := require.New(t) 116 117 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 118 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 119 120 // Insert "dead" eval 121 state := s1.fsm.State() 122 eval := mock.Eval() 123 eval.Status = structs.EvalStatusFailed 124 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 125 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 126 require.Nil(err) 127 128 // Insert "pending" eval for same job 129 eval2 := mock.Eval() 130 eval2.JobID = eval.JobID 131 state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID)) 132 err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) 133 require.Nil(err) 134 135 // Insert mock job with default reschedule policy of 2 in 10 minutes 136 job := mock.Job() 137 job.ID = eval.JobID 138 139 err = state.UpsertJob(1001, job) 140 require.Nil(err) 141 142 // Insert failed alloc with an old reschedule attempt, can be GCed 143 alloc := mock.Alloc() 144 alloc.EvalID = eval.ID 145 alloc.DesiredStatus = structs.AllocDesiredStatusRun 146 alloc.ClientStatus = structs.AllocClientStatusFailed 147 alloc.JobID = eval.JobID 148 alloc.TaskGroup = job.TaskGroups[0].Name 149 alloc.RescheduleTracker = &structs.RescheduleTracker{ 150 Events: []*structs.RescheduleEvent{ 151 { 152 RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), 153 PrevNodeID: uuid.Generate(), 154 PrevAllocID: uuid.Generate(), 155 }, 156 }, 157 } 158 159 // Insert another failed alloc with a recent reschedule attempt, can't be GCed 160 alloc2 := mock.Alloc() 161 alloc2.EvalID = eval.ID 162 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 163 alloc2.ClientStatus = structs.AllocClientStatusLost 164 alloc2.JobID = eval.JobID 165 alloc2.TaskGroup = job.TaskGroups[0].Name 166 alloc2.RescheduleTracker = &structs.RescheduleTracker{ 167 Events: []*structs.RescheduleEvent{ 168 { 169 RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), 170 PrevNodeID: uuid.Generate(), 171 PrevAllocID: uuid.Generate(), 172 }, 173 }, 174 } 175 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) 176 require.Nil(err) 177 178 // Update the time tables to make this work 179 tt := s1.fsm.TimeTable() 180 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 181 182 // Create a core scheduler 183 snap, err := state.Snapshot() 184 if err != nil { 185 t.Fatalf("err: %v", err) 186 } 187 core := NewCoreScheduler(s1, snap) 188 189 // Attempt the GC, job has all terminal allocs and one pending eval 190 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 191 err = core.Process(gc) 192 require.Nil(err) 193 194 // Eval should still exist 195 ws := memdb.NewWatchSet() 196 out, err := state.EvalByID(ws, eval.ID) 197 require.Nil(err) 198 require.Equal(eval.ID, out.ID) 199 200 outA, err := state.AllocByID(ws, alloc.ID) 201 require.Nil(err) 202 require.Nil(outA) 203 204 outA2, err := state.AllocByID(ws, alloc2.ID) 205 require.Nil(err) 206 require.Equal(alloc2.ID, outA2.ID) 207 208 } 209 210 // Tests GC behavior on stopped job with reschedulable allocs 211 func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) { 212 t.Parallel() 213 s1 := TestServer(t, nil) 214 defer s1.Shutdown() 215 testutil.WaitForLeader(t, s1.RPC) 216 require := require.New(t) 217 218 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 219 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 220 221 // Insert "dead" eval 222 state := s1.fsm.State() 223 eval := mock.Eval() 224 eval.Status = structs.EvalStatusFailed 225 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 226 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 227 require.Nil(err) 228 229 // Insert mock stopped job with default reschedule policy of 2 in 10 minutes 230 job := mock.Job() 231 job.ID = eval.JobID 232 job.Stop = true 233 234 err = state.UpsertJob(1001, job) 235 require.Nil(err) 236 237 // Insert failed alloc with a recent reschedule attempt 238 alloc := mock.Alloc() 239 alloc.EvalID = eval.ID 240 alloc.DesiredStatus = structs.AllocDesiredStatusRun 241 alloc.ClientStatus = structs.AllocClientStatusLost 242 alloc.JobID = eval.JobID 243 alloc.TaskGroup = job.TaskGroups[0].Name 244 alloc.RescheduleTracker = &structs.RescheduleTracker{ 245 Events: []*structs.RescheduleEvent{ 246 { 247 RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), 248 PrevNodeID: uuid.Generate(), 249 PrevAllocID: uuid.Generate(), 250 }, 251 }, 252 } 253 err = state.UpsertAllocs(1001, []*structs.Allocation{alloc}) 254 require.Nil(err) 255 256 // Update the time tables to make this work 257 tt := s1.fsm.TimeTable() 258 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 259 260 // Create a core scheduler 261 snap, err := state.Snapshot() 262 if err != nil { 263 t.Fatalf("err: %v", err) 264 } 265 core := NewCoreScheduler(s1, snap) 266 267 // Attempt the GC 268 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 269 err = core.Process(gc) 270 require.Nil(err) 271 272 // Eval should not exist 273 ws := memdb.NewWatchSet() 274 out, err := state.EvalByID(ws, eval.ID) 275 require.Nil(err) 276 require.Nil(out) 277 278 // Alloc should not exist 279 outA, err := state.AllocByID(ws, alloc.ID) 280 require.Nil(err) 281 require.Nil(outA) 282 283 } 284 285 // An EvalGC should never reap a batch job that has not been stopped 286 func TestCoreScheduler_EvalGC_Batch(t *testing.T) { 287 t.Parallel() 288 s1 := TestServer(t, nil) 289 defer s1.Shutdown() 290 testutil.WaitForLeader(t, s1.RPC) 291 292 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 293 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 294 295 // Insert a "dead" job 296 state := s1.fsm.State() 297 job := mock.Job() 298 job.Type = structs.JobTypeBatch 299 job.Status = structs.JobStatusDead 300 err := state.UpsertJob(1000, job) 301 if err != nil { 302 t.Fatalf("err: %v", err) 303 } 304 305 // Insert "complete" eval 306 eval := mock.Eval() 307 eval.Status = structs.EvalStatusComplete 308 eval.Type = structs.JobTypeBatch 309 eval.JobID = job.ID 310 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 311 if err != nil { 312 t.Fatalf("err: %v", err) 313 } 314 315 // Insert "failed" alloc 316 alloc := mock.Alloc() 317 alloc.JobID = job.ID 318 alloc.EvalID = eval.ID 319 alloc.DesiredStatus = structs.AllocDesiredStatusStop 320 321 // Insert "lost" alloc 322 alloc2 := mock.Alloc() 323 alloc2.JobID = job.ID 324 alloc2.EvalID = eval.ID 325 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 326 alloc2.ClientStatus = structs.AllocClientStatusLost 327 328 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 329 if err != nil { 330 t.Fatalf("err: %v", err) 331 } 332 333 // Update the time tables to make this work 334 tt := s1.fsm.TimeTable() 335 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 336 337 // Create a core scheduler 338 snap, err := state.Snapshot() 339 if err != nil { 340 t.Fatalf("err: %v", err) 341 } 342 core := NewCoreScheduler(s1, snap) 343 344 // Attempt the GC 345 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 346 err = core.Process(gc) 347 if err != nil { 348 t.Fatalf("err: %v", err) 349 } 350 351 // Nothing should be gone 352 ws := memdb.NewWatchSet() 353 out, err := state.EvalByID(ws, eval.ID) 354 if err != nil { 355 t.Fatalf("err: %v", err) 356 } 357 if out == nil { 358 t.Fatalf("bad: %v", out) 359 } 360 361 outA, err := state.AllocByID(ws, alloc.ID) 362 if err != nil { 363 t.Fatalf("err: %v", err) 364 } 365 if outA == nil { 366 t.Fatalf("bad: %v", outA) 367 } 368 369 outA2, err := state.AllocByID(ws, alloc2.ID) 370 if err != nil { 371 t.Fatalf("err: %v", err) 372 } 373 if outA2 == nil { 374 t.Fatalf("bad: %v", outA2) 375 } 376 377 outB, err := state.JobByID(ws, job.Namespace, job.ID) 378 if err != nil { 379 t.Fatalf("err: %v", err) 380 } 381 if outB == nil { 382 t.Fatalf("bad: %v", outB) 383 } 384 } 385 386 // An EvalGC should reap a batch job that has been stopped 387 func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { 388 t.Parallel() 389 s1 := TestServer(t, nil) 390 defer s1.Shutdown() 391 testutil.WaitForLeader(t, s1.RPC) 392 393 require := require.New(t) 394 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 395 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 396 397 // Create a "dead" job 398 state := s1.fsm.State() 399 job := mock.Job() 400 job.Type = structs.JobTypeBatch 401 job.Status = structs.JobStatusDead 402 job.Stop = true 403 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 404 Attempts: 0, 405 Interval: 0 * time.Second, 406 } 407 err := state.UpsertJob(1001, job) 408 require.Nil(err) 409 410 // Insert "complete" eval 411 eval := mock.Eval() 412 eval.Status = structs.EvalStatusComplete 413 eval.Type = structs.JobTypeBatch 414 eval.JobID = job.ID 415 err = state.UpsertEvals(1002, []*structs.Evaluation{eval}) 416 require.Nil(err) 417 418 // Insert "failed" alloc 419 alloc := mock.Alloc() 420 alloc.JobID = job.ID 421 alloc.EvalID = eval.ID 422 alloc.TaskGroup = job.TaskGroups[0].Name 423 alloc.DesiredStatus = structs.AllocDesiredStatusStop 424 425 // Insert "lost" alloc 426 alloc2 := mock.Alloc() 427 alloc2.JobID = job.ID 428 alloc2.EvalID = eval.ID 429 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 430 alloc2.ClientStatus = structs.AllocClientStatusLost 431 alloc2.TaskGroup = job.TaskGroups[0].Name 432 433 err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2}) 434 if err != nil { 435 t.Fatalf("err: %v", err) 436 } 437 438 // Update the time tables to make this work 439 tt := s1.fsm.TimeTable() 440 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 441 442 // Create a core scheduler 443 snap, err := state.Snapshot() 444 if err != nil { 445 t.Fatalf("err: %v", err) 446 } 447 core := NewCoreScheduler(s1, snap) 448 449 // Attempt the GC 450 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 451 err = core.Process(gc) 452 if err != nil { 453 t.Fatalf("err: %v", err) 454 } 455 456 // Everything should be gone 457 ws := memdb.NewWatchSet() 458 out, err := state.EvalByID(ws, eval.ID) 459 if err != nil { 460 t.Fatalf("err: %v", err) 461 } 462 if out != nil { 463 t.Fatalf("bad: %v", out) 464 } 465 466 outA, err := state.AllocByID(ws, alloc.ID) 467 if err != nil { 468 t.Fatalf("err: %v", err) 469 } 470 if outA != nil { 471 t.Fatalf("bad: %v", outA) 472 } 473 474 outA2, err := state.AllocByID(ws, alloc2.ID) 475 if err != nil { 476 t.Fatalf("err: %v", err) 477 } 478 if outA2 != nil { 479 t.Fatalf("bad: %v", outA2) 480 } 481 } 482 483 func TestCoreScheduler_EvalGC_Partial(t *testing.T) { 484 t.Parallel() 485 s1 := TestServer(t, nil) 486 defer s1.Shutdown() 487 testutil.WaitForLeader(t, s1.RPC) 488 require := require.New(t) 489 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 490 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 491 492 // Insert "dead" eval 493 state := s1.fsm.State() 494 eval := mock.Eval() 495 eval.Status = structs.EvalStatusComplete 496 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 497 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 498 if err != nil { 499 t.Fatalf("err: %v", err) 500 } 501 502 // Create mock job with id same as eval 503 job := mock.Job() 504 job.ID = eval.JobID 505 506 // Insert "dead" alloc 507 alloc := mock.Alloc() 508 alloc.JobID = job.ID 509 alloc.EvalID = eval.ID 510 alloc.DesiredStatus = structs.AllocDesiredStatusStop 511 alloc.TaskGroup = job.TaskGroups[0].Name 512 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 513 514 // Insert "lost" alloc 515 alloc2 := mock.Alloc() 516 alloc2.JobID = job.ID 517 alloc2.EvalID = eval.ID 518 alloc2.TaskGroup = job.TaskGroups[0].Name 519 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 520 alloc2.ClientStatus = structs.AllocClientStatusLost 521 522 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 523 if err != nil { 524 t.Fatalf("err: %v", err) 525 } 526 527 // Insert "running" alloc 528 alloc3 := mock.Alloc() 529 alloc3.EvalID = eval.ID 530 alloc3.JobID = job.ID 531 state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID)) 532 err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3}) 533 if err != nil { 534 t.Fatalf("err: %v", err) 535 } 536 537 // Insert mock job with rescheduling disabled 538 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 539 Attempts: 0, 540 Interval: 0 * time.Second, 541 } 542 err = state.UpsertJob(1001, job) 543 require.Nil(err) 544 545 // Update the time tables to make this work 546 tt := s1.fsm.TimeTable() 547 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) 548 549 // Create a core scheduler 550 snap, err := state.Snapshot() 551 if err != nil { 552 t.Fatalf("err: %v", err) 553 } 554 core := NewCoreScheduler(s1, snap) 555 556 // Attempt the GC 557 gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) 558 err = core.Process(gc) 559 if err != nil { 560 t.Fatalf("err: %v", err) 561 } 562 563 // Should not be gone 564 ws := memdb.NewWatchSet() 565 out, err := state.EvalByID(ws, eval.ID) 566 if err != nil { 567 t.Fatalf("err: %v", err) 568 } 569 if out == nil { 570 t.Fatalf("bad: %v", out) 571 } 572 573 outA, err := state.AllocByID(ws, alloc3.ID) 574 if err != nil { 575 t.Fatalf("err: %v", err) 576 } 577 if outA == nil { 578 t.Fatalf("bad: %v", outA) 579 } 580 581 // Should be gone 582 outB, err := state.AllocByID(ws, alloc.ID) 583 if err != nil { 584 t.Fatalf("err: %v", err) 585 } 586 if outB != nil { 587 t.Fatalf("bad: %v", outB) 588 } 589 590 outC, err := state.AllocByID(ws, alloc2.ID) 591 if err != nil { 592 t.Fatalf("err: %v", err) 593 } 594 if outC != nil { 595 t.Fatalf("bad: %v", outC) 596 } 597 } 598 599 func TestCoreScheduler_EvalGC_Force(t *testing.T) { 600 t.Parallel() 601 for _, withAcl := range []bool{false, true} { 602 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 603 require := require.New(t) 604 var server *Server 605 if withAcl { 606 server, _ = TestACLServer(t, nil) 607 } else { 608 server = TestServer(t, nil) 609 } 610 defer server.Shutdown() 611 testutil.WaitForLeader(t, server.RPC) 612 613 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 614 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 615 616 // Insert "dead" eval 617 state := server.fsm.State() 618 eval := mock.Eval() 619 eval.Status = structs.EvalStatusFailed 620 state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) 621 err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) 622 if err != nil { 623 t.Fatalf("err: %v", err) 624 } 625 626 // Insert mock job with rescheduling disabled 627 job := mock.Job() 628 job.ID = eval.JobID 629 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 630 Attempts: 0, 631 Interval: 0 * time.Second, 632 } 633 err = state.UpsertJob(1001, job) 634 require.Nil(err) 635 636 // Insert "dead" alloc 637 alloc := mock.Alloc() 638 alloc.EvalID = eval.ID 639 alloc.DesiredStatus = structs.AllocDesiredStatusStop 640 alloc.TaskGroup = job.TaskGroups[0].Name 641 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 642 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) 643 if err != nil { 644 t.Fatalf("err: %v", err) 645 } 646 647 // Create a core scheduler 648 snap, err := state.Snapshot() 649 if err != nil { 650 t.Fatalf("err: %v", err) 651 } 652 core := NewCoreScheduler(server, snap) 653 654 // Attempt the GC 655 gc := server.coreJobEval(structs.CoreJobForceGC, 1002) 656 err = core.Process(gc) 657 if err != nil { 658 t.Fatalf("err: %v", err) 659 } 660 661 // Should be gone 662 ws := memdb.NewWatchSet() 663 out, err := state.EvalByID(ws, eval.ID) 664 if err != nil { 665 t.Fatalf("err: %v", err) 666 } 667 if out != nil { 668 t.Fatalf("bad: %v", out) 669 } 670 671 outA, err := state.AllocByID(ws, alloc.ID) 672 if err != nil { 673 t.Fatalf("err: %v", err) 674 } 675 if outA != nil { 676 t.Fatalf("bad: %v", outA) 677 } 678 }) 679 } 680 } 681 682 func TestCoreScheduler_NodeGC(t *testing.T) { 683 t.Parallel() 684 for _, withAcl := range []bool{false, true} { 685 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 686 var server *Server 687 if withAcl { 688 server, _ = TestACLServer(t, nil) 689 } else { 690 server = TestServer(t, nil) 691 } 692 defer server.Shutdown() 693 testutil.WaitForLeader(t, server.RPC) 694 695 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 696 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 697 698 // Insert "dead" node 699 state := server.fsm.State() 700 node := mock.Node() 701 node.Status = structs.NodeStatusDown 702 err := state.UpsertNode(1000, node) 703 if err != nil { 704 t.Fatalf("err: %v", err) 705 } 706 707 // Update the time tables to make this work 708 tt := server.fsm.TimeTable() 709 tt.Witness(2000, time.Now().UTC().Add(-1*server.config.NodeGCThreshold)) 710 711 // Create a core scheduler 712 snap, err := state.Snapshot() 713 if err != nil { 714 t.Fatalf("err: %v", err) 715 } 716 core := NewCoreScheduler(server, snap) 717 718 // Attempt the GC 719 gc := server.coreJobEval(structs.CoreJobNodeGC, 2000) 720 err = core.Process(gc) 721 if err != nil { 722 t.Fatalf("err: %v", err) 723 } 724 725 // Should be gone 726 ws := memdb.NewWatchSet() 727 out, err := state.NodeByID(ws, node.ID) 728 if err != nil { 729 t.Fatalf("err: %v", err) 730 } 731 if out != nil { 732 t.Fatalf("bad: %v", out) 733 } 734 }) 735 } 736 } 737 738 func TestCoreScheduler_NodeGC_TerminalAllocs(t *testing.T) { 739 t.Parallel() 740 s1 := TestServer(t, nil) 741 defer s1.Shutdown() 742 testutil.WaitForLeader(t, s1.RPC) 743 744 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 745 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 746 747 // Insert "dead" node 748 state := s1.fsm.State() 749 node := mock.Node() 750 node.Status = structs.NodeStatusDown 751 err := state.UpsertNode(1000, node) 752 if err != nil { 753 t.Fatalf("err: %v", err) 754 } 755 756 // Insert a terminal alloc on that node 757 alloc := mock.Alloc() 758 alloc.DesiredStatus = structs.AllocDesiredStatusStop 759 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 760 if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil { 761 t.Fatalf("err: %v", err) 762 } 763 764 // Update the time tables to make this work 765 tt := s1.fsm.TimeTable() 766 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold)) 767 768 // Create a core scheduler 769 snap, err := state.Snapshot() 770 if err != nil { 771 t.Fatalf("err: %v", err) 772 } 773 core := NewCoreScheduler(s1, snap) 774 775 // Attempt the GC 776 gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000) 777 err = core.Process(gc) 778 if err != nil { 779 t.Fatalf("err: %v", err) 780 } 781 782 // Should be gone 783 ws := memdb.NewWatchSet() 784 out, err := state.NodeByID(ws, node.ID) 785 if err != nil { 786 t.Fatalf("err: %v", err) 787 } 788 if out != nil { 789 t.Fatalf("bad: %v", out) 790 } 791 } 792 793 func TestCoreScheduler_NodeGC_RunningAllocs(t *testing.T) { 794 t.Parallel() 795 s1 := TestServer(t, nil) 796 defer s1.Shutdown() 797 testutil.WaitForLeader(t, s1.RPC) 798 799 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 800 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 801 802 // Insert "dead" node 803 state := s1.fsm.State() 804 node := mock.Node() 805 node.Status = structs.NodeStatusDown 806 err := state.UpsertNode(1000, node) 807 if err != nil { 808 t.Fatalf("err: %v", err) 809 } 810 811 // Insert a running alloc on that node 812 alloc := mock.Alloc() 813 alloc.NodeID = node.ID 814 alloc.DesiredStatus = structs.AllocDesiredStatusRun 815 alloc.ClientStatus = structs.AllocClientStatusRunning 816 state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) 817 if err := state.UpsertAllocs(1002, []*structs.Allocation{alloc}); err != nil { 818 t.Fatalf("err: %v", err) 819 } 820 821 // Update the time tables to make this work 822 tt := s1.fsm.TimeTable() 823 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.NodeGCThreshold)) 824 825 // Create a core scheduler 826 snap, err := state.Snapshot() 827 if err != nil { 828 t.Fatalf("err: %v", err) 829 } 830 core := NewCoreScheduler(s1, snap) 831 832 // Attempt the GC 833 gc := s1.coreJobEval(structs.CoreJobNodeGC, 2000) 834 err = core.Process(gc) 835 if err != nil { 836 t.Fatalf("err: %v", err) 837 } 838 839 // Should still be here 840 ws := memdb.NewWatchSet() 841 out, err := state.NodeByID(ws, node.ID) 842 if err != nil { 843 t.Fatalf("err: %v", err) 844 } 845 if out == nil { 846 t.Fatalf("bad: %v", out) 847 } 848 } 849 850 func TestCoreScheduler_NodeGC_Force(t *testing.T) { 851 t.Parallel() 852 s1 := TestServer(t, nil) 853 defer s1.Shutdown() 854 testutil.WaitForLeader(t, s1.RPC) 855 856 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 857 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 858 859 // Insert "dead" node 860 state := s1.fsm.State() 861 node := mock.Node() 862 node.Status = structs.NodeStatusDown 863 err := state.UpsertNode(1000, node) 864 if err != nil { 865 t.Fatalf("err: %v", err) 866 } 867 868 // Create a core scheduler 869 snap, err := state.Snapshot() 870 if err != nil { 871 t.Fatalf("err: %v", err) 872 } 873 core := NewCoreScheduler(s1, snap) 874 875 // Attempt the GC 876 gc := s1.coreJobEval(structs.CoreJobForceGC, 1000) 877 err = core.Process(gc) 878 if err != nil { 879 t.Fatalf("err: %v", err) 880 } 881 882 // Should be gone 883 ws := memdb.NewWatchSet() 884 out, err := state.NodeByID(ws, node.ID) 885 if err != nil { 886 t.Fatalf("err: %v", err) 887 } 888 if out != nil { 889 t.Fatalf("bad: %v", out) 890 } 891 } 892 893 func TestCoreScheduler_JobGC_OutstandingEvals(t *testing.T) { 894 t.Parallel() 895 s1 := TestServer(t, nil) 896 defer s1.Shutdown() 897 testutil.WaitForLeader(t, s1.RPC) 898 899 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 900 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 901 902 // Insert job. 903 state := s1.fsm.State() 904 job := mock.Job() 905 job.Type = structs.JobTypeBatch 906 job.Status = structs.JobStatusDead 907 err := state.UpsertJob(1000, job) 908 if err != nil { 909 t.Fatalf("err: %v", err) 910 } 911 912 // Insert two evals, one terminal and one not 913 eval := mock.Eval() 914 eval.JobID = job.ID 915 eval.Status = structs.EvalStatusComplete 916 917 eval2 := mock.Eval() 918 eval2.JobID = job.ID 919 eval2.Status = structs.EvalStatusPending 920 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 921 if err != nil { 922 t.Fatalf("err: %v", err) 923 } 924 925 // Update the time tables to make this work 926 tt := s1.fsm.TimeTable() 927 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 928 929 // Create a core scheduler 930 snap, err := state.Snapshot() 931 if err != nil { 932 t.Fatalf("err: %v", err) 933 } 934 core := NewCoreScheduler(s1, snap) 935 936 // Attempt the GC 937 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 938 err = core.Process(gc) 939 if err != nil { 940 t.Fatalf("err: %v", err) 941 } 942 943 // Should still exist 944 ws := memdb.NewWatchSet() 945 out, err := state.JobByID(ws, job.Namespace, job.ID) 946 if err != nil { 947 t.Fatalf("err: %v", err) 948 } 949 if out == nil { 950 t.Fatalf("bad: %v", out) 951 } 952 953 outE, err := state.EvalByID(ws, eval.ID) 954 if err != nil { 955 t.Fatalf("err: %v", err) 956 } 957 if outE == nil { 958 t.Fatalf("bad: %v", outE) 959 } 960 961 outE2, err := state.EvalByID(ws, eval2.ID) 962 if err != nil { 963 t.Fatalf("err: %v", err) 964 } 965 if outE2 == nil { 966 t.Fatalf("bad: %v", outE2) 967 } 968 969 // Update the second eval to be terminal 970 eval2.Status = structs.EvalStatusComplete 971 err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) 972 if err != nil { 973 t.Fatalf("err: %v", err) 974 } 975 976 // Create a core scheduler 977 snap, err = state.Snapshot() 978 if err != nil { 979 t.Fatalf("err: %v", err) 980 } 981 core = NewCoreScheduler(s1, snap) 982 983 // Attempt the GC 984 gc = s1.coreJobEval(structs.CoreJobJobGC, 2000) 985 err = core.Process(gc) 986 if err != nil { 987 t.Fatalf("err: %v", err) 988 } 989 990 // Should not still exist 991 out, err = state.JobByID(ws, job.Namespace, job.ID) 992 if err != nil { 993 t.Fatalf("err: %v", err) 994 } 995 if out != nil { 996 t.Fatalf("bad: %v", out) 997 } 998 999 outE, err = state.EvalByID(ws, eval.ID) 1000 if err != nil { 1001 t.Fatalf("err: %v", err) 1002 } 1003 if outE != nil { 1004 t.Fatalf("bad: %v", outE) 1005 } 1006 1007 outE2, err = state.EvalByID(ws, eval2.ID) 1008 if err != nil { 1009 t.Fatalf("err: %v", err) 1010 } 1011 if outE2 != nil { 1012 t.Fatalf("bad: %v", outE2) 1013 } 1014 } 1015 1016 func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { 1017 t.Parallel() 1018 s1 := TestServer(t, nil) 1019 defer s1.Shutdown() 1020 testutil.WaitForLeader(t, s1.RPC) 1021 1022 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1023 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1024 1025 // Insert job. 1026 state := s1.fsm.State() 1027 job := mock.Job() 1028 job.Type = structs.JobTypeBatch 1029 job.Status = structs.JobStatusDead 1030 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 1031 Attempts: 0, 1032 Interval: 0 * time.Second, 1033 } 1034 err := state.UpsertJob(1000, job) 1035 if err != nil { 1036 t.Fatalf("err: %v", err) 1037 } 1038 1039 // Insert an eval 1040 eval := mock.Eval() 1041 eval.JobID = job.ID 1042 eval.Status = structs.EvalStatusComplete 1043 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 1044 if err != nil { 1045 t.Fatalf("err: %v", err) 1046 } 1047 1048 // Insert two allocs, one terminal and one not 1049 alloc := mock.Alloc() 1050 alloc.JobID = job.ID 1051 alloc.EvalID = eval.ID 1052 alloc.DesiredStatus = structs.AllocDesiredStatusRun 1053 alloc.ClientStatus = structs.AllocClientStatusComplete 1054 alloc.TaskGroup = job.TaskGroups[0].Name 1055 1056 alloc2 := mock.Alloc() 1057 alloc2.JobID = job.ID 1058 alloc2.EvalID = eval.ID 1059 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 1060 alloc2.ClientStatus = structs.AllocClientStatusRunning 1061 alloc2.TaskGroup = job.TaskGroups[0].Name 1062 1063 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 1064 if err != nil { 1065 t.Fatalf("err: %v", err) 1066 } 1067 1068 // Update the time tables to make this work 1069 tt := s1.fsm.TimeTable() 1070 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1071 1072 // Create a core scheduler 1073 snap, err := state.Snapshot() 1074 if err != nil { 1075 t.Fatalf("err: %v", err) 1076 } 1077 core := NewCoreScheduler(s1, snap) 1078 1079 // Attempt the GC 1080 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1081 err = core.Process(gc) 1082 if err != nil { 1083 t.Fatalf("err: %v", err) 1084 } 1085 1086 // Should still exist 1087 ws := memdb.NewWatchSet() 1088 out, err := state.JobByID(ws, job.Namespace, job.ID) 1089 if err != nil { 1090 t.Fatalf("err: %v", err) 1091 } 1092 if out == nil { 1093 t.Fatalf("bad: %v", out) 1094 } 1095 1096 outA, err := state.AllocByID(ws, alloc.ID) 1097 if err != nil { 1098 t.Fatalf("err: %v", err) 1099 } 1100 if outA == nil { 1101 t.Fatalf("bad: %v", outA) 1102 } 1103 1104 outA2, err := state.AllocByID(ws, alloc2.ID) 1105 if err != nil { 1106 t.Fatalf("err: %v", err) 1107 } 1108 if outA2 == nil { 1109 t.Fatalf("bad: %v", outA2) 1110 } 1111 1112 // Update the second alloc to be terminal 1113 alloc2.ClientStatus = structs.AllocClientStatusComplete 1114 err = state.UpsertAllocs(1003, []*structs.Allocation{alloc2}) 1115 if err != nil { 1116 t.Fatalf("err: %v", err) 1117 } 1118 1119 // Create a core scheduler 1120 snap, err = state.Snapshot() 1121 if err != nil { 1122 t.Fatalf("err: %v", err) 1123 } 1124 core = NewCoreScheduler(s1, snap) 1125 1126 // Attempt the GC 1127 gc = s1.coreJobEval(structs.CoreJobJobGC, 2000) 1128 err = core.Process(gc) 1129 if err != nil { 1130 t.Fatalf("err: %v", err) 1131 } 1132 1133 // Should not still exist 1134 out, err = state.JobByID(ws, job.Namespace, job.ID) 1135 if err != nil { 1136 t.Fatalf("err: %v", err) 1137 } 1138 if out != nil { 1139 t.Fatalf("bad: %v", out) 1140 } 1141 1142 outA, err = state.AllocByID(ws, alloc.ID) 1143 if err != nil { 1144 t.Fatalf("err: %v", err) 1145 } 1146 if outA != nil { 1147 t.Fatalf("bad: %v", outA) 1148 } 1149 1150 outA2, err = state.AllocByID(ws, alloc2.ID) 1151 if err != nil { 1152 t.Fatalf("err: %v", err) 1153 } 1154 if outA2 != nil { 1155 t.Fatalf("bad: %v", outA2) 1156 } 1157 } 1158 1159 // This test ensures that batch jobs are GC'd in one shot, meaning it all 1160 // allocs/evals and job or nothing 1161 func TestCoreScheduler_JobGC_OneShot(t *testing.T) { 1162 t.Parallel() 1163 s1 := TestServer(t, nil) 1164 defer s1.Shutdown() 1165 testutil.WaitForLeader(t, s1.RPC) 1166 1167 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1168 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1169 1170 // Insert job. 1171 state := s1.fsm.State() 1172 job := mock.Job() 1173 job.Type = structs.JobTypeBatch 1174 err := state.UpsertJob(1000, job) 1175 if err != nil { 1176 t.Fatalf("err: %v", err) 1177 } 1178 1179 // Insert two complete evals 1180 eval := mock.Eval() 1181 eval.JobID = job.ID 1182 eval.Status = structs.EvalStatusComplete 1183 1184 eval2 := mock.Eval() 1185 eval2.JobID = job.ID 1186 eval2.Status = structs.EvalStatusComplete 1187 1188 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 1189 if err != nil { 1190 t.Fatalf("err: %v", err) 1191 } 1192 1193 // Insert one complete alloc and one running on distinct evals 1194 alloc := mock.Alloc() 1195 alloc.JobID = job.ID 1196 alloc.EvalID = eval.ID 1197 alloc.DesiredStatus = structs.AllocDesiredStatusStop 1198 1199 alloc2 := mock.Alloc() 1200 alloc2.JobID = job.ID 1201 alloc2.EvalID = eval2.ID 1202 alloc2.DesiredStatus = structs.AllocDesiredStatusRun 1203 1204 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) 1205 if err != nil { 1206 t.Fatalf("err: %v", err) 1207 } 1208 1209 // Force the jobs state to dead 1210 job.Status = structs.JobStatusDead 1211 1212 // Update the time tables to make this work 1213 tt := s1.fsm.TimeTable() 1214 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1215 1216 // Create a core scheduler 1217 snap, err := state.Snapshot() 1218 if err != nil { 1219 t.Fatalf("err: %v", err) 1220 } 1221 core := NewCoreScheduler(s1, snap) 1222 1223 // Attempt the GC 1224 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1225 err = core.Process(gc) 1226 if err != nil { 1227 t.Fatalf("err: %v", err) 1228 } 1229 1230 // Should still exist 1231 ws := memdb.NewWatchSet() 1232 out, err := state.JobByID(ws, job.Namespace, job.ID) 1233 if err != nil { 1234 t.Fatalf("err: %v", err) 1235 } 1236 if out == nil { 1237 t.Fatalf("bad: %v", out) 1238 } 1239 1240 outE, err := state.EvalByID(ws, eval.ID) 1241 if err != nil { 1242 t.Fatalf("err: %v", err) 1243 } 1244 if outE == nil { 1245 t.Fatalf("bad: %v", outE) 1246 } 1247 1248 outE2, err := state.EvalByID(ws, eval2.ID) 1249 if err != nil { 1250 t.Fatalf("err: %v", err) 1251 } 1252 if outE2 == nil { 1253 t.Fatalf("bad: %v", outE2) 1254 } 1255 1256 outA, err := state.AllocByID(ws, alloc.ID) 1257 if err != nil { 1258 t.Fatalf("err: %v", err) 1259 } 1260 if outA == nil { 1261 t.Fatalf("bad: %v", outA) 1262 } 1263 outA2, err := state.AllocByID(ws, alloc2.ID) 1264 if err != nil { 1265 t.Fatalf("err: %v", err) 1266 } 1267 if outA2 == nil { 1268 t.Fatalf("bad: %v", outA2) 1269 } 1270 } 1271 1272 // This test ensures that stopped jobs are GCd 1273 func TestCoreScheduler_JobGC_Stopped(t *testing.T) { 1274 t.Parallel() 1275 s1 := TestServer(t, nil) 1276 defer s1.Shutdown() 1277 testutil.WaitForLeader(t, s1.RPC) 1278 1279 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1280 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1281 1282 // Insert job. 1283 state := s1.fsm.State() 1284 job := mock.Job() 1285 job.Stop = true 1286 job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ 1287 Attempts: 0, 1288 Interval: 0 * time.Second, 1289 } 1290 err := state.UpsertJob(1000, job) 1291 if err != nil { 1292 t.Fatalf("err: %v", err) 1293 } 1294 1295 // Insert two complete evals 1296 eval := mock.Eval() 1297 eval.JobID = job.ID 1298 eval.Status = structs.EvalStatusComplete 1299 1300 eval2 := mock.Eval() 1301 eval2.JobID = job.ID 1302 eval2.Status = structs.EvalStatusComplete 1303 1304 err = state.UpsertEvals(1001, []*structs.Evaluation{eval, eval2}) 1305 if err != nil { 1306 t.Fatalf("err: %v", err) 1307 } 1308 1309 // Insert one complete alloc 1310 alloc := mock.Alloc() 1311 alloc.JobID = job.ID 1312 alloc.EvalID = eval.ID 1313 alloc.DesiredStatus = structs.AllocDesiredStatusStop 1314 alloc.TaskGroup = job.TaskGroups[0].Name 1315 err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) 1316 if err != nil { 1317 t.Fatalf("err: %v", err) 1318 } 1319 1320 // Update the time tables to make this work 1321 tt := s1.fsm.TimeTable() 1322 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.JobGCThreshold)) 1323 1324 // Create a core scheduler 1325 snap, err := state.Snapshot() 1326 if err != nil { 1327 t.Fatalf("err: %v", err) 1328 } 1329 core := NewCoreScheduler(s1, snap) 1330 1331 // Attempt the GC 1332 gc := s1.coreJobEval(structs.CoreJobJobGC, 2000) 1333 err = core.Process(gc) 1334 if err != nil { 1335 t.Fatalf("err: %v", err) 1336 } 1337 1338 // Shouldn't still exist 1339 ws := memdb.NewWatchSet() 1340 out, err := state.JobByID(ws, job.Namespace, job.ID) 1341 if err != nil { 1342 t.Fatalf("err: %v", err) 1343 } 1344 if out != nil { 1345 t.Fatalf("bad: %v", out) 1346 } 1347 1348 outE, err := state.EvalByID(ws, eval.ID) 1349 if err != nil { 1350 t.Fatalf("err: %v", err) 1351 } 1352 if outE != nil { 1353 t.Fatalf("bad: %v", outE) 1354 } 1355 1356 outE2, err := state.EvalByID(ws, eval2.ID) 1357 if err != nil { 1358 t.Fatalf("err: %v", err) 1359 } 1360 if outE2 != nil { 1361 t.Fatalf("bad: %v", outE2) 1362 } 1363 1364 outA, err := state.AllocByID(ws, alloc.ID) 1365 if err != nil { 1366 t.Fatalf("err: %v", err) 1367 } 1368 if outA != nil { 1369 t.Fatalf("bad: %v", outA) 1370 } 1371 } 1372 1373 func TestCoreScheduler_JobGC_Force(t *testing.T) { 1374 t.Parallel() 1375 for _, withAcl := range []bool{false, true} { 1376 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 1377 var server *Server 1378 if withAcl { 1379 server, _ = TestACLServer(t, nil) 1380 } else { 1381 server = TestServer(t, nil) 1382 } 1383 defer server.Shutdown() 1384 testutil.WaitForLeader(t, server.RPC) 1385 1386 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1387 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1388 1389 // Insert job. 1390 state := server.fsm.State() 1391 job := mock.Job() 1392 job.Type = structs.JobTypeBatch 1393 job.Status = structs.JobStatusDead 1394 err := state.UpsertJob(1000, job) 1395 if err != nil { 1396 t.Fatalf("err: %v", err) 1397 } 1398 1399 // Insert a terminal eval 1400 eval := mock.Eval() 1401 eval.JobID = job.ID 1402 eval.Status = structs.EvalStatusComplete 1403 err = state.UpsertEvals(1001, []*structs.Evaluation{eval}) 1404 if err != nil { 1405 t.Fatalf("err: %v", err) 1406 } 1407 1408 // Create a core scheduler 1409 snap, err := state.Snapshot() 1410 if err != nil { 1411 t.Fatalf("err: %v", err) 1412 } 1413 core := NewCoreScheduler(server, snap) 1414 1415 // Attempt the GC 1416 gc := server.coreJobEval(structs.CoreJobForceGC, 1002) 1417 err = core.Process(gc) 1418 if err != nil { 1419 t.Fatalf("err: %v", err) 1420 } 1421 1422 // Shouldn't still exist 1423 ws := memdb.NewWatchSet() 1424 out, err := state.JobByID(ws, job.Namespace, job.ID) 1425 if err != nil { 1426 t.Fatalf("err: %v", err) 1427 } 1428 if out != nil { 1429 t.Fatalf("bad: %v", out) 1430 } 1431 1432 outE, err := state.EvalByID(ws, eval.ID) 1433 if err != nil { 1434 t.Fatalf("err: %v", err) 1435 } 1436 if outE != nil { 1437 t.Fatalf("bad: %v", outE) 1438 } 1439 }) 1440 } 1441 } 1442 1443 // This test ensures parameterized jobs only get gc'd when stopped 1444 func TestCoreScheduler_JobGC_Parameterized(t *testing.T) { 1445 t.Parallel() 1446 s1 := TestServer(t, nil) 1447 defer s1.Shutdown() 1448 testutil.WaitForLeader(t, s1.RPC) 1449 1450 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1451 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1452 1453 // Insert a parameterized job. 1454 state := s1.fsm.State() 1455 job := mock.Job() 1456 job.Type = structs.JobTypeBatch 1457 job.Status = structs.JobStatusRunning 1458 job.ParameterizedJob = &structs.ParameterizedJobConfig{ 1459 Payload: structs.DispatchPayloadRequired, 1460 } 1461 err := state.UpsertJob(1000, job) 1462 if err != nil { 1463 t.Fatalf("err: %v", err) 1464 } 1465 1466 // Create a core scheduler 1467 snap, err := state.Snapshot() 1468 if err != nil { 1469 t.Fatalf("err: %v", err) 1470 } 1471 core := NewCoreScheduler(s1, snap) 1472 1473 // Attempt the GC 1474 gc := s1.coreJobEval(structs.CoreJobForceGC, 1002) 1475 err = core.Process(gc) 1476 if err != nil { 1477 t.Fatalf("err: %v", err) 1478 } 1479 1480 // Should still exist 1481 ws := memdb.NewWatchSet() 1482 out, err := state.JobByID(ws, job.Namespace, job.ID) 1483 if err != nil { 1484 t.Fatalf("err: %v", err) 1485 } 1486 if out == nil { 1487 t.Fatalf("bad: %v", out) 1488 } 1489 1490 // Mark the job as stopped and try again 1491 job2 := job.Copy() 1492 job2.Stop = true 1493 err = state.UpsertJob(2000, job2) 1494 if err != nil { 1495 t.Fatalf("err: %v", err) 1496 } 1497 1498 // Create a core scheduler 1499 snap, err = state.Snapshot() 1500 if err != nil { 1501 t.Fatalf("err: %v", err) 1502 } 1503 core = NewCoreScheduler(s1, snap) 1504 1505 // Attempt the GC 1506 gc = s1.coreJobEval(structs.CoreJobForceGC, 2002) 1507 err = core.Process(gc) 1508 if err != nil { 1509 t.Fatalf("err: %v", err) 1510 } 1511 1512 // Should not exist 1513 out, err = state.JobByID(ws, job.Namespace, job.ID) 1514 if err != nil { 1515 t.Fatalf("err: %v", err) 1516 } 1517 if out != nil { 1518 t.Fatalf("bad: %+v", out) 1519 } 1520 } 1521 1522 // This test ensures periodic jobs don't get GCd til they are stopped 1523 func TestCoreScheduler_JobGC_Periodic(t *testing.T) { 1524 t.Parallel() 1525 1526 s1 := TestServer(t, nil) 1527 defer s1.Shutdown() 1528 testutil.WaitForLeader(t, s1.RPC) 1529 1530 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1531 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1532 1533 // Insert a parameterized job. 1534 state := s1.fsm.State() 1535 job := mock.PeriodicJob() 1536 err := state.UpsertJob(1000, job) 1537 if err != nil { 1538 t.Fatalf("err: %v", err) 1539 } 1540 1541 // Create a core scheduler 1542 snap, err := state.Snapshot() 1543 if err != nil { 1544 t.Fatalf("err: %v", err) 1545 } 1546 core := NewCoreScheduler(s1, snap) 1547 1548 // Attempt the GC 1549 gc := s1.coreJobEval(structs.CoreJobForceGC, 1002) 1550 err = core.Process(gc) 1551 if err != nil { 1552 t.Fatalf("err: %v", err) 1553 } 1554 1555 // Should still exist 1556 ws := memdb.NewWatchSet() 1557 out, err := state.JobByID(ws, job.Namespace, job.ID) 1558 if err != nil { 1559 t.Fatalf("err: %v", err) 1560 } 1561 if out == nil { 1562 t.Fatalf("bad: %v", out) 1563 } 1564 1565 // Mark the job as stopped and try again 1566 job2 := job.Copy() 1567 job2.Stop = true 1568 err = state.UpsertJob(2000, job2) 1569 if err != nil { 1570 t.Fatalf("err: %v", err) 1571 } 1572 1573 // Create a core scheduler 1574 snap, err = state.Snapshot() 1575 if err != nil { 1576 t.Fatalf("err: %v", err) 1577 } 1578 core = NewCoreScheduler(s1, snap) 1579 1580 // Attempt the GC 1581 gc = s1.coreJobEval(structs.CoreJobForceGC, 2002) 1582 err = core.Process(gc) 1583 if err != nil { 1584 t.Fatalf("err: %v", err) 1585 } 1586 1587 // Should not exist 1588 out, err = state.JobByID(ws, job.Namespace, job.ID) 1589 if err != nil { 1590 t.Fatalf("err: %v", err) 1591 } 1592 if out != nil { 1593 t.Fatalf("bad: %+v", out) 1594 } 1595 } 1596 1597 func TestCoreScheduler_DeploymentGC(t *testing.T) { 1598 t.Parallel() 1599 s1 := TestServer(t, nil) 1600 defer s1.Shutdown() 1601 testutil.WaitForLeader(t, s1.RPC) 1602 assert := assert.New(t) 1603 1604 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1605 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1606 1607 // Insert an active, terminal, and terminal with allocations edeployment 1608 state := s1.fsm.State() 1609 d1, d2, d3 := mock.Deployment(), mock.Deployment(), mock.Deployment() 1610 d1.Status = structs.DeploymentStatusFailed 1611 d3.Status = structs.DeploymentStatusSuccessful 1612 assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment") 1613 assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment") 1614 assert.Nil(state.UpsertDeployment(1002, d3), "UpsertDeployment") 1615 1616 a := mock.Alloc() 1617 a.JobID = d3.JobID 1618 a.DeploymentID = d3.ID 1619 assert.Nil(state.UpsertAllocs(1003, []*structs.Allocation{a}), "UpsertAllocs") 1620 1621 // Update the time tables to make this work 1622 tt := s1.fsm.TimeTable() 1623 tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.DeploymentGCThreshold)) 1624 1625 // Create a core scheduler 1626 snap, err := state.Snapshot() 1627 assert.Nil(err, "Snapshot") 1628 core := NewCoreScheduler(s1, snap) 1629 1630 // Attempt the GC 1631 gc := s1.coreJobEval(structs.CoreJobDeploymentGC, 2000) 1632 assert.Nil(core.Process(gc), "Process GC") 1633 1634 // Should be gone 1635 ws := memdb.NewWatchSet() 1636 out, err := state.DeploymentByID(ws, d1.ID) 1637 assert.Nil(err, "DeploymentByID") 1638 assert.Nil(out, "Terminal Deployment") 1639 out2, err := state.DeploymentByID(ws, d2.ID) 1640 assert.Nil(err, "DeploymentByID") 1641 assert.NotNil(out2, "Active Deployment") 1642 out3, err := state.DeploymentByID(ws, d3.ID) 1643 assert.Nil(err, "DeploymentByID") 1644 assert.NotNil(out3, "Terminal Deployment With Allocs") 1645 } 1646 1647 func TestCoreScheduler_DeploymentGC_Force(t *testing.T) { 1648 t.Parallel() 1649 for _, withAcl := range []bool{false, true} { 1650 t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { 1651 var server *Server 1652 if withAcl { 1653 server, _ = TestACLServer(t, nil) 1654 } else { 1655 server = TestServer(t, nil) 1656 } 1657 defer server.Shutdown() 1658 testutil.WaitForLeader(t, server.RPC) 1659 assert := assert.New(t) 1660 1661 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1662 server.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1663 1664 // Insert terminal and active deployment 1665 state := server.fsm.State() 1666 d1, d2 := mock.Deployment(), mock.Deployment() 1667 d1.Status = structs.DeploymentStatusFailed 1668 assert.Nil(state.UpsertDeployment(1000, d1), "UpsertDeployment") 1669 assert.Nil(state.UpsertDeployment(1001, d2), "UpsertDeployment") 1670 1671 // Create a core scheduler 1672 snap, err := state.Snapshot() 1673 assert.Nil(err, "Snapshot") 1674 core := NewCoreScheduler(server, snap) 1675 1676 // Attempt the GC 1677 gc := server.coreJobEval(structs.CoreJobForceGC, 1000) 1678 assert.Nil(core.Process(gc), "Process Force GC") 1679 1680 // Should be gone 1681 ws := memdb.NewWatchSet() 1682 out, err := state.DeploymentByID(ws, d1.ID) 1683 assert.Nil(err, "DeploymentByID") 1684 assert.Nil(out, "Terminal Deployment") 1685 out2, err := state.DeploymentByID(ws, d2.ID) 1686 assert.Nil(err, "DeploymentByID") 1687 assert.NotNil(out2, "Active Deployment") 1688 }) 1689 } 1690 } 1691 1692 func TestCoreScheduler_PartitionEvalReap(t *testing.T) { 1693 t.Parallel() 1694 s1 := TestServer(t, nil) 1695 defer s1.Shutdown() 1696 testutil.WaitForLeader(t, s1.RPC) 1697 1698 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1699 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1700 1701 // Create a core scheduler 1702 snap, err := s1.fsm.State().Snapshot() 1703 if err != nil { 1704 t.Fatalf("err: %v", err) 1705 } 1706 core := NewCoreScheduler(s1, snap) 1707 1708 // Set the max ids per reap to something lower. 1709 maxIdsPerReap = 2 1710 1711 evals := []string{"a", "b", "c"} 1712 allocs := []string{"1", "2", "3"} 1713 requests := core.(*CoreScheduler).partitionEvalReap(evals, allocs) 1714 if len(requests) != 3 { 1715 t.Fatalf("Expected 3 requests got: %v", requests) 1716 } 1717 1718 first := requests[0] 1719 if len(first.Allocs) != 2 && len(first.Evals) != 0 { 1720 t.Fatalf("Unexpected first request: %v", first) 1721 } 1722 1723 second := requests[1] 1724 if len(second.Allocs) != 1 && len(second.Evals) != 1 { 1725 t.Fatalf("Unexpected second request: %v", second) 1726 } 1727 1728 third := requests[2] 1729 if len(third.Allocs) != 0 && len(third.Evals) != 2 { 1730 t.Fatalf("Unexpected third request: %v", third) 1731 } 1732 } 1733 1734 func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) { 1735 t.Parallel() 1736 s1 := TestServer(t, nil) 1737 defer s1.Shutdown() 1738 testutil.WaitForLeader(t, s1.RPC) 1739 1740 // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 1741 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) 1742 1743 // Create a core scheduler 1744 snap, err := s1.fsm.State().Snapshot() 1745 if err != nil { 1746 t.Fatalf("err: %v", err) 1747 } 1748 core := NewCoreScheduler(s1, snap) 1749 1750 // Set the max ids per reap to something lower. 1751 maxIdsPerReap = 2 1752 1753 deployments := []string{"a", "b", "c"} 1754 requests := core.(*CoreScheduler).partitionDeploymentReap(deployments) 1755 if len(requests) != 2 { 1756 t.Fatalf("Expected 2 requests got: %v", requests) 1757 } 1758 1759 first := requests[0] 1760 if len(first.Deployments) != 2 { 1761 t.Fatalf("Unexpected first request: %v", first) 1762 } 1763 1764 second := requests[1] 1765 if len(second.Deployments) != 1 { 1766 t.Fatalf("Unexpected second request: %v", second) 1767 } 1768 } 1769 1770 // Tests various scenarios when allocations are eligible to be GCed 1771 func TestAllocation_GCEligible(t *testing.T) { 1772 type testCase struct { 1773 Desc string 1774 GCTime time.Time 1775 ClientStatus string 1776 DesiredStatus string 1777 JobStatus string 1778 JobStop bool 1779 ModifyIndex uint64 1780 NextAllocID string 1781 ReschedulePolicy *structs.ReschedulePolicy 1782 RescheduleTrackers []*structs.RescheduleEvent 1783 ThresholdIndex uint64 1784 ShouldGC bool 1785 } 1786 1787 fail := time.Now() 1788 1789 harness := []testCase{ 1790 { 1791 Desc: "GC when non terminal", 1792 ClientStatus: structs.AllocClientStatusPending, 1793 DesiredStatus: structs.AllocDesiredStatusRun, 1794 GCTime: fail, 1795 ModifyIndex: 90, 1796 ThresholdIndex: 90, 1797 ShouldGC: false, 1798 }, 1799 { 1800 Desc: "GC when non terminal and job stopped", 1801 ClientStatus: structs.AllocClientStatusPending, 1802 DesiredStatus: structs.AllocDesiredStatusRun, 1803 JobStop: true, 1804 GCTime: fail, 1805 ModifyIndex: 90, 1806 ThresholdIndex: 90, 1807 ShouldGC: false, 1808 }, 1809 { 1810 Desc: "GC when non terminal and job dead", 1811 ClientStatus: structs.AllocClientStatusPending, 1812 DesiredStatus: structs.AllocDesiredStatusRun, 1813 JobStatus: structs.JobStatusDead, 1814 GCTime: fail, 1815 ModifyIndex: 90, 1816 ThresholdIndex: 90, 1817 ShouldGC: false, 1818 }, 1819 { 1820 Desc: "GC when threshold not met", 1821 ClientStatus: structs.AllocClientStatusComplete, 1822 DesiredStatus: structs.AllocDesiredStatusStop, 1823 GCTime: fail, 1824 ModifyIndex: 100, 1825 ThresholdIndex: 90, 1826 ReschedulePolicy: nil, 1827 ShouldGC: false, 1828 }, 1829 { 1830 Desc: "GC when no reschedule policy", 1831 ClientStatus: structs.AllocClientStatusFailed, 1832 DesiredStatus: structs.AllocDesiredStatusRun, 1833 GCTime: fail, 1834 ReschedulePolicy: nil, 1835 ModifyIndex: 90, 1836 ThresholdIndex: 90, 1837 ShouldGC: true, 1838 }, 1839 { 1840 Desc: "GC when empty policy", 1841 ClientStatus: structs.AllocClientStatusFailed, 1842 DesiredStatus: structs.AllocDesiredStatusRun, 1843 GCTime: fail, 1844 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 0, Interval: 0 * time.Minute}, 1845 ModifyIndex: 90, 1846 ThresholdIndex: 90, 1847 ShouldGC: true, 1848 }, 1849 { 1850 Desc: "GC with no previous attempts", 1851 ClientStatus: structs.AllocClientStatusFailed, 1852 DesiredStatus: structs.AllocDesiredStatusRun, 1853 GCTime: fail, 1854 ModifyIndex: 90, 1855 ThresholdIndex: 90, 1856 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 1, Interval: 1 * time.Minute}, 1857 ShouldGC: false, 1858 }, 1859 { 1860 Desc: "GC with prev reschedule attempt within interval", 1861 ClientStatus: structs.AllocClientStatusFailed, 1862 DesiredStatus: structs.AllocDesiredStatusRun, 1863 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute}, 1864 GCTime: fail, 1865 ModifyIndex: 90, 1866 ThresholdIndex: 90, 1867 RescheduleTrackers: []*structs.RescheduleEvent{ 1868 { 1869 RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), 1870 }, 1871 }, 1872 ShouldGC: false, 1873 }, 1874 { 1875 Desc: "GC with prev reschedule attempt outside interval", 1876 ClientStatus: structs.AllocClientStatusFailed, 1877 DesiredStatus: structs.AllocDesiredStatusRun, 1878 GCTime: fail, 1879 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 1880 RescheduleTrackers: []*structs.RescheduleEvent{ 1881 { 1882 RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), 1883 }, 1884 { 1885 RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), 1886 }, 1887 }, 1888 ShouldGC: true, 1889 }, 1890 { 1891 Desc: "GC when next alloc id is set", 1892 ClientStatus: structs.AllocClientStatusFailed, 1893 DesiredStatus: structs.AllocDesiredStatusRun, 1894 GCTime: fail, 1895 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 1896 RescheduleTrackers: []*structs.RescheduleEvent{ 1897 { 1898 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 1899 }, 1900 }, 1901 NextAllocID: uuid.Generate(), 1902 ShouldGC: true, 1903 }, 1904 { 1905 Desc: "GC when job is stopped", 1906 ClientStatus: structs.AllocClientStatusFailed, 1907 DesiredStatus: structs.AllocDesiredStatusRun, 1908 GCTime: fail, 1909 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 1910 RescheduleTrackers: []*structs.RescheduleEvent{ 1911 { 1912 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 1913 }, 1914 }, 1915 JobStop: true, 1916 ShouldGC: true, 1917 }, 1918 { 1919 Desc: "GC when job status is dead", 1920 ClientStatus: structs.AllocClientStatusFailed, 1921 DesiredStatus: structs.AllocDesiredStatusRun, 1922 GCTime: fail, 1923 ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 5, Interval: 30 * time.Minute}, 1924 RescheduleTrackers: []*structs.RescheduleEvent{ 1925 { 1926 RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), 1927 }, 1928 }, 1929 JobStatus: structs.JobStatusDead, 1930 ShouldGC: true, 1931 }, 1932 } 1933 1934 for _, tc := range harness { 1935 alloc := &structs.Allocation{} 1936 alloc.ModifyIndex = tc.ModifyIndex 1937 alloc.DesiredStatus = tc.DesiredStatus 1938 alloc.ClientStatus = tc.ClientStatus 1939 alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers} 1940 alloc.NextAllocation = tc.NextAllocID 1941 job := mock.Job() 1942 alloc.TaskGroup = job.TaskGroups[0].Name 1943 job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy 1944 if tc.JobStatus != "" { 1945 job.Status = tc.JobStatus 1946 } 1947 job.Stop = tc.JobStop 1948 1949 t.Run(tc.Desc, func(t *testing.T) { 1950 if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { 1951 t.Fatalf("expected %v but got %v", tc.ShouldGC, got) 1952 } 1953 }) 1954 1955 } 1956 1957 // Verify nil job 1958 require := require.New(t) 1959 alloc := mock.Alloc() 1960 alloc.ClientStatus = structs.AllocClientStatusComplete 1961 require.True(allocGCEligible(alloc, nil, time.Now(), 1000)) 1962 }