github.com/manicqin/nomad@v0.9.5/nomad/drainer_int_test.go

package nomad

import (
	"context"
	"fmt"
	"net/rpc"
	"testing"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)
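// allocPromoter watches the allocations placed on a node and, for any service
// allocation that does not yet have its deployment health set, marks it
// healthy. This stands in for the client-side health reporting that the
// drainer normally waits on before migrating service allocations.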
func allocPromoter(errCh chan<- error, ctx context.Context,
	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
	logger log.Logger) {

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			if alloc.DeploymentStatus.HasHealth() {
				continue
			}
			newAlloc := alloc.Copy()
			newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy:   helper.BoolToPtr(true),
				Timestamp: now,
			}
			updates = append(updates, newAlloc)
			logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID)
		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			}
			errCh <- err
		}
	}
}

// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}

func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}

func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := state.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}
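// The two helpers above illustrate the state store's blocking-query pattern:
// register watch channels on the tables you read, return that table's last
// index, and state.BlockingQuery blocks until the index advances. As a
// minimal sketch of the same pattern (a hypothetical helper, not part of the
// original suite, and assuming the node table is named "nodes"), a test could
// block until a node's drain strategy is cleared instead of polling with
// testutil.WaitForResult:
func waitForDrainCleared(ctx context.Context, store *state.StateStore, nodeID string) error {
	minIndex := uint64(1)
	for {
		resp, index, err := store.BlockingQuery(func(ws memdb.WatchSet, s *state.StateStore) (interface{}, uint64, error) {
			// Register a watch on the node and report the nodes table index
			node, err := s.NodeByID(ws, nodeID)
			if err != nil {
				return nil, 0, err
			}
			idx, err := s.Index("nodes")
			if err != nil {
				return nil, 0, err
			}
			return node != nil && node.DrainStrategy == nil, idx, nil
		}, minIndex, ctx)
		if err != nil {
			return err
		}
		if resp.(bool) {
			return nil
		}
		// Drain still set; block again until the nodes table changes
		minIndex = index
	}
}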
func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just the one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
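// TestDrainer_Simple_ServiceOnly_Deadline verifies that when the drain
// deadline expires before the service allocations can be migrated (the job's
// 30s update stagger outlasts the 1s deadline), the remaining allocations are
// stopped and the drain completes as deadlined.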
func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just the one node
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Drain the node with a deadline far shorter than the job's stagger
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

func TestDrainer_DrainEmptyNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Check that the node drain is removed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just the one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
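	// Between the service job (2), the system job (1), and the batch job
	// (2), five allocations should end up on the first node.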
	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that the service allocs finished before the batch and system
	// allocs: the drainer migrates service allocations right away, while
	// batch allocations are left running until the deadline, so the service
	// allocs must carry the lower ModifyIndexes.
	var serviceMax, batchMax uint64
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	require.True(serviceMax < batchMax)

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}
// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just the one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the service allocs to be stopped on the draining node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
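	// With no deadline, the drainer never stops the batch allocations; the
	// drain can only complete once they finish on their own, which is
	// simulated below by marking them complete client-side.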
	// Mark the batch allocations as finished
	allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	require.Nil(err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		newAlloc := alloc.Copy()
		newAlloc.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, newAlloc)
	}
	require.Nil(state.UpdateAllocsFromClient(1000, updates))

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the service allocations to be placed on the other node: two
	// service plus one system alloc, the batch allocs having completed
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just the one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
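	// Terminal allocations whose node has been garbage collected must be
	// ignored by the drainer rather than blocking the drain.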
	// Create some old terminal allocs for each job that point at a non-existent
	// node to simulate it being on a GC'd node.
	var badAllocs []*structs.Allocation
	for _, job := range []*structs.Job{job, sysjob, bjob} {
		alloc := mock.Alloc()
		alloc.Namespace = job.Namespace
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DesiredStatus = structs.AllocDesiredStatusStop
		alloc.ClientStatus = structs.AllocClientStatusComplete
		badAllocs = append(badAllocs, alloc)
	}
	require.NoError(state.UpsertAllocs(1, badAllocs))

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that transitions to force drain work.
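// Both an infinite (zero) and a finite deadline are exercised: in either
// case the batch job is left running until the node is force drained.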
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	t.Parallel()

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			require := require.New(t)
			s1, cleanupS1 := TestServer(t, nil)
			defer cleanupS1()
			codec := rpcClient(t, s1)
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			require.NotZero(resp.Index)

			// Wait for the allocations to be placed
			state := s1.State()
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Pick the deadline
			deadline := 0 * time.Second // Infinite
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Wait for the allocs to be replaced
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

			// Make sure the batch job isn't affected
			testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
				if err := checkAllocPromoter(errCh); err != nil {
					return false, fmt.Errorf("check alloc promoter error: %v", err)
				}

				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, fmt.Errorf("AllocsByNode error: %v", err)
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))
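			// A negative deadline forces an immediate drain: the batch
			// allocations that were previously untouched should now be
			// stopped and migrated.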
			// Make sure the batch job is migrated
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check that the node drain is removed
			testutil.WaitForResult(func() (bool, error) {
				node, err := state.NodeByID(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check we got the right events
			node, err := state.NodeByID(nil, n1.ID)
			require.NoError(err)
			require.Len(node.Events, 4)
			require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
			require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
		})
	}
}