github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/nomad/drainer_int_test.go

package nomad

import (
    "context"
    "fmt"
    "net/rpc"
    "testing"
    "time"

    log "github.com/hashicorp/go-hclog"
    memdb "github.com/hashicorp/go-memdb"
    msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"

    "github.com/hashicorp/nomad/helper"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/drainer"
    "github.com/hashicorp/nomad/nomad/mock"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/hashicorp/nomad/testutil"
    "github.com/stretchr/testify/require"
)

// allocPromoter stands in for a Nomad client during these tests: it watches
// the given node's allocations and marks any service alloc that has no
// deployment health status as healthy via the Node.UpdateAlloc RPC, which is
// what lets a node drain make progress. Failures are reported on errCh.
func allocPromoter(errCh chan<- error, ctx context.Context,
    state *state.StateStore, codec rpc.ClientCodec, nodeID string,
    logger log.Logger) {

    nindex := uint64(1)
    for {
        allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
        if err != nil {
            if err == context.Canceled {
                return
            }

            errCh <- fmt.Errorf("failed to get node allocs: %v", err)
            return
        }
        nindex = index

        // For each alloc that doesn't have its deployment status set, set it
        var updates []*structs.Allocation
        now := time.Now()
        for _, alloc := range allocs {
            if alloc.Job.Type != structs.JobTypeService {
                continue
            }

            if alloc.DeploymentStatus.HasHealth() {
                continue
            }
            newAlloc := alloc.Copy()
            newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
                Healthy:   helper.BoolToPtr(true),
                Timestamp: now,
            }
            updates = append(updates, newAlloc)
            logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID)
        }

        if len(updates) == 0 {
            continue
        }

        // Send the update
        req := &structs.AllocUpdateRequest{
            Alloc:        updates,
            WriteRequest: structs.WriteRequest{Region: "global"},
        }
        var resp structs.GenericResponse
        if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
            if ctx.Err() == context.Canceled {
                return
            }
            errCh <- err
        }
    }
}
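// runAllocPromoters is an illustrative sketch, not part of the original test
// file: the tests below spawn one allocPromoter goroutine per node by hand,
// and a shared launcher for that pattern could look like this. The helper
// name is hypothetical.
func runAllocPromoters(ctx context.Context, store *state.StateStore,
    codec rpc.ClientCodec, logger log.Logger, nodeIDs ...string) chan error {

    // Buffer one error slot per promoter so a failing goroutine never blocks.
    errCh := make(chan error, len(nodeIDs))
    for _, id := range nodeIDs {
        go allocPromoter(errCh, ctx, store, codec, id, logger)
    }
    return errCh
}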
// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
    select {
    case err := <-errCh:
        return err
    default:
        return nil
    }
}

// getNodeAllocs runs a blocking query for the given node's allocations,
// returning once the allocs table advances past the given index or the
// context is canceled.
func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
    resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
    if err != nil {
        return nil, 0, err
    }
    if err := ctx.Err(); err != nil {
        return nil, 0, err
    }

    return resp.([]*structs.Allocation), index, nil
}

// getNodeAllocsImpl returns the query function that getNodeAllocs runs as a
// blocking query.
func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
    return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
        // Capture all the allocations
        allocs, err := state.AllocsByNode(ws, nodeID)
        if err != nil {
            return nil, 0, err
        }

        // Use the last index that affected the allocs table
        index, err := state.Index("allocs")
        if err != nil {
            return nil, index, err
        }

        return allocs, index, nil
    }
}
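// registerNode is an illustrative sketch, not part of the original test file:
// every test below inlines this Node.Register boilerplate, and factored out
// it could look like this. The helper name is hypothetical.
func registerNode(t *testing.T, codec rpc.ClientCodec, node *structs.Node) {
    t.Helper()
    req := &structs.NodeRegisterRequest{
        Node:         node,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var resp structs.NodeUpdateResponse
    require.NoError(t, msgpackrpc.CallWithCodec(codec, "Node.Register", req, &resp))
}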
func TestDrainer_Simple_ServiceOnly(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create two nodes, registering the second later
    n1, n2 := mock.Node(), mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Create a job that runs on just one node
    job := mock.Job()
    job.TaskGroups[0].Count = 2
    req := &structs.JobRegisterRequest{
        Job: job,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    var resp structs.JobRegisterResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Wait for the two allocations to be placed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
        if err != nil {
            return false, err
        }
        return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Create the second node
    nodeReg = &structs.NodeRegisterRequest{
        Node:         n2,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Drain the first node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 10 * time.Minute,
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Wait for the allocs to be replaced
    errCh := make(chan error, 2)
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
    go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n2.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check that the node drain is removed
    testutil.WaitForResult(func() (bool, error) {
        if err := checkAllocPromoter(errCh); err != nil {
            return false, err
        }
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
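// waitForDrainRemoved is an illustrative sketch, not part of the original
// test file: the "node drain is removed" wait in the test above recurs in
// every test below, and a shared helper could look like this. The helper name
// is hypothetical.
func waitForDrainRemoved(t *testing.T, store *state.StateStore, nodeID string) {
    t.Helper()
    testutil.WaitForResult(func() (bool, error) {
        node, err := store.NodeByID(nil, nodeID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })
}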
func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create a node
    n1 := mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Create a service job
    job := mock.Job()
    job.Update = *structs.DefaultUpdateStrategy
    job.Update.Stagger = 30 * time.Second
    job.TaskGroups[0].Count = 2
    req := &structs.JobRegisterRequest{
        Job: job,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    var resp structs.JobRegisterResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Wait for the two allocations to be placed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
        if err != nil {
            return false, err
        }
        return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Drain the node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 1 * time.Second,
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Wait for the allocs to be stopped
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        for _, alloc := range allocs {
            if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
                return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
            }
        }
        return true, nil
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check that the node drain is removed
    testutil.WaitForResult(func() (bool, error) {
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
    require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

func TestDrainer_DrainEmptyNode(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create a node
    n1 := mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Drain the node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 10 * time.Minute,
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Check that the node drain is removed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
func TestDrainer_AllTypes_Deadline(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create two nodes, registering the second later
    n1, n2 := mock.Node(), mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Create a service job that runs on just one node
    job := mock.Job()
    job.TaskGroups[0].Count = 2
    req := &structs.JobRegisterRequest{
        Job: job,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    var resp structs.JobRegisterResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Create a system job
    sysjob := mock.SystemJob()
    req = &structs.JobRegisterRequest{
        Job: sysjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Create a batch job
    bjob := mock.BatchJob()
    bjob.TaskGroups[0].Count = 2
    req = &structs.JobRegisterRequest{
        Job: bjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Wait for the allocations to be placed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Create the second node
    nodeReg = &structs.NodeRegisterRequest{
        Node:         n2,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Drain the node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 2 * time.Second,
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Wait for the allocs to be replaced
    errCh := make(chan error, 2)
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
    go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

    // Wait for the allocs to be stopped
    var finalAllocs []*structs.Allocation
    testutil.WaitForResult(func() (bool, error) {
        if err := checkAllocPromoter(errCh); err != nil {
            return false, err
        }

        var err error
        finalAllocs, err = state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        for _, alloc := range finalAllocs {
            if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
                return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
            }
        }
        return true, nil
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check that the node drain is removed
    testutil.WaitForResult(func() (bool, error) {
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Wait for the allocations to be placed on the other node
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n2.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Assert that the service allocs were stopped before the batch allocs: a
    // lower maximum ModifyIndex means those stops were committed earlier.
    var serviceMax, batchMax uint64
    for _, alloc := range finalAllocs {
        if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
            serviceMax = alloc.ModifyIndex
        } else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
            batchMax = alloc.ModifyIndex
        }
    }
    require.True(serviceMax < batchMax)

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
    require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}
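// maxModifyIndex is an illustrative sketch, not part of the original test
// file: it restates the ordering check at the end of the test above. Raft
// indexes increase monotonically, so the highest ModifyIndex among a job
// type's allocs tells when the last of them was written; comparing the
// service and batch maxima shows the service allocs were stopped first.
func maxModifyIndex(allocs []*structs.Allocation, jobType string) uint64 {
    var max uint64
    for _, alloc := range allocs {
        if alloc.Job.Type == jobType && alloc.ModifyIndex > max {
            max = alloc.ModifyIndex
        }
    }
    return max
}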
// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create two nodes, registering the second later
    n1, n2 := mock.Node(), mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Create a service job that runs on just one node
    job := mock.Job()
    job.TaskGroups[0].Count = 2
    req := &structs.JobRegisterRequest{
        Job: job,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    var resp structs.JobRegisterResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Create a system job
    sysjob := mock.SystemJob()
    req = &structs.JobRegisterRequest{
        Job: sysjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Create a batch job
    bjob := mock.BatchJob()
    bjob.TaskGroups[0].Count = 2
    req = &structs.JobRegisterRequest{
        Job: bjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)

    // Wait for the allocations to be placed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Create the second node
    nodeReg = &structs.NodeRegisterRequest{
        Node:         n2,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Drain the node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 0 * time.Second, // Infinite
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Wait for the allocs to be replaced
    errCh := make(chan error, 2)
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
    go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

    // Wait for the service allocs to be stopped on the draining node
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
        if err != nil {
            return false, err
        }
        for _, alloc := range allocs {
            if alloc.NodeID != n1.ID {
                continue
            }
            if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
                return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
            }
        }
        if err := checkAllocPromoter(errCh); err != nil {
            return false, err
        }
        return true, nil
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Mark the batch allocations as finished
    allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
    require.Nil(err)

    var updates []*structs.Allocation
    for _, alloc := range allocs {
        update := alloc.Copy()
        update.ClientStatus = structs.AllocClientStatusComplete
        updates = append(updates, update)
    }
    require.Nil(state.UpdateAllocsFromClient(1000, updates))

    // Check that the node drain is removed
    testutil.WaitForResult(func() (bool, error) {
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Wait for the service allocations to be placed on the other node
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n2.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}
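// completeAllocs is an illustrative sketch, not part of the original test
// file: it captures the "mark the batch allocations as finished" pattern used
// in the test above, simulating a client reporting its allocs complete. The
// helper name is hypothetical.
func completeAllocs(store *state.StateStore, index uint64, allocs []*structs.Allocation) error {
    var updates []*structs.Allocation
    for _, alloc := range allocs {
        update := alloc.Copy()
        update.ClientStatus = structs.AllocClientStatusComplete
        updates = append(updates, update)
    }
    return store.UpdateAllocsFromClient(index, updates)
}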
func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    s1 := TestServer(t, nil)
    defer s1.Shutdown()
    codec := rpcClient(t, s1)
    testutil.WaitForLeader(t, s1.RPC)

    // Create two nodes, registering the second later
    n1, n2 := mock.Node(), mock.Node()
    nodeReg := &structs.NodeRegisterRequest{
        Node:         n1,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var nodeResp structs.NodeUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Create a service job that runs on just one node
    job := mock.Job()
    job.TaskGroups[0].Count = 2
    req := &structs.JobRegisterRequest{
        Job: job,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    var resp structs.JobRegisterResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)
    job.CreateIndex = resp.JobModifyIndex

    // Create a system job
    sysjob := mock.SystemJob()
    req = &structs.JobRegisterRequest{
        Job: sysjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)
    sysjob.CreateIndex = resp.JobModifyIndex

    // Create a batch job
    bjob := mock.BatchJob()
    bjob.TaskGroups[0].Count = 2
    req = &structs.JobRegisterRequest{
        Job: bjob,
        WriteRequest: structs.WriteRequest{
            Region:    "global",
            Namespace: job.Namespace,
        },
    }

    // Fetch the response
    require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
    require.NotZero(resp.Index)
    bjob.CreateIndex = resp.JobModifyIndex

    // Wait for the allocations to be placed
    state := s1.State()
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Create some old terminal allocs for each job that point at a non-existent
    // node to simulate it being on a GC'd node.
    var badAllocs []*structs.Allocation
    for _, job := range []*structs.Job{job, sysjob, bjob} {
        alloc := mock.Alloc()
        alloc.Namespace = job.Namespace
        alloc.Job = job
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.TaskGroup = job.TaskGroups[0].Name
        alloc.DesiredStatus = structs.AllocDesiredStatusStop
        alloc.ClientStatus = structs.AllocClientStatusComplete
        badAllocs = append(badAllocs, alloc)
    }
    require.NoError(state.UpsertAllocs(1, badAllocs))

    // Create the second node
    nodeReg = &structs.NodeRegisterRequest{
        Node:         n2,
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

    // Drain the node
    drainReq := &structs.NodeUpdateDrainRequest{
        NodeID: n1.ID,
        DrainStrategy: &structs.DrainStrategy{
            DrainSpec: structs.DrainSpec{
                Deadline: 2 * time.Second,
            },
        },
        WriteRequest: structs.WriteRequest{Region: "global"},
    }
    var drainResp structs.NodeDrainUpdateResponse
    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

    // Wait for the allocs to be replaced
    errCh := make(chan error, 2)
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
    go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

    // Wait for the allocs to be stopped
    var finalAllocs []*structs.Allocation
    testutil.WaitForResult(func() (bool, error) {
        if err := checkAllocPromoter(errCh); err != nil {
            return false, err
        }

        var err error
        finalAllocs, err = state.AllocsByNode(nil, n1.ID)
        if err != nil {
            return false, err
        }
        for _, alloc := range finalAllocs {
            if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
                return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
            }
        }
        return true, nil
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check that the node drain is removed
    testutil.WaitForResult(func() (bool, error) {
        node, err := state.NodeByID(nil, n1.ID)
        if err != nil {
            return false, err
        }
        return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Wait for the allocations to be placed on the other node
    testutil.WaitForResult(func() (bool, error) {
        allocs, err := state.AllocsByNode(nil, n2.ID)
        if err != nil {
            return false, err
        }
        return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
    }, func(err error) {
        t.Fatalf("err: %v", err)
    })

    // Check we got the right events
    node, err := state.NodeByID(nil, n1.ID)
    require.NoError(err)
    require.Len(node.Events, 3)
    require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
    require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}
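// terminalAllocOn is an illustrative sketch, not part of the original test
// file: it factors out the badAllocs construction in the test above, building
// a terminal alloc for a job on an arbitrary (possibly GC'd) node ID. The
// helper name is hypothetical.
func terminalAllocOn(nodeID string, job *structs.Job) *structs.Allocation {
    alloc := mock.Alloc()
    alloc.Namespace = job.Namespace
    alloc.Job = job
    alloc.JobID = job.ID
    alloc.NodeID = nodeID
    alloc.TaskGroup = job.TaskGroups[0].Name
    alloc.DesiredStatus = structs.AllocDesiredStatusStop
    alloc.ClientStatus = structs.AllocClientStatusComplete
    return alloc
}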
// Test that transitions to force drain work.
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
    t.Parallel()

    for _, inf := range []bool{true, false} {
        name := "Infinite"
        if !inf {
            name = "Deadline"
        }
        t.Run(name, func(t *testing.T) {
            require := require.New(t)
            s1 := TestServer(t, nil)
            defer s1.Shutdown()
            codec := rpcClient(t, s1)
            testutil.WaitForLeader(t, s1.RPC)

            // Create a node
            n1 := mock.Node()
            nodeReg := &structs.NodeRegisterRequest{
                Node:         n1,
                WriteRequest: structs.WriteRequest{Region: "global"},
            }
            var nodeResp structs.NodeUpdateResponse
            require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

            // Create a batch job
            bjob := mock.BatchJob()
            bjob.TaskGroups[0].Count = 2
            req := &structs.JobRegisterRequest{
                Job: bjob,
                WriteRequest: structs.WriteRequest{
                    Region:    "global",
                    Namespace: bjob.Namespace,
                },
            }

            // Fetch the response
            var resp structs.JobRegisterResponse
            require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
            require.NotZero(resp.Index)

            // Wait for the allocations to be placed
            state := s1.State()
            testutil.WaitForResult(func() (bool, error) {
                allocs, err := state.AllocsByNode(nil, n1.ID)
                if err != nil {
                    return false, err
                }
                return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
            }, func(err error) {
                t.Fatalf("err: %v", err)
            })

            // Pick the deadline
            deadline := 0 * time.Second
            if !inf {
                deadline = 10 * time.Second
            }

            // Drain the node
            drainReq := &structs.NodeUpdateDrainRequest{
                NodeID: n1.ID,
                DrainStrategy: &structs.DrainStrategy{
                    DrainSpec: structs.DrainSpec{
                        Deadline: deadline,
                    },
                },
                WriteRequest: structs.WriteRequest{Region: "global"},
            }
            var drainResp structs.NodeDrainUpdateResponse
            require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

            // Wait for the allocs to be replaced
            errCh := make(chan error, 1)
            ctx, cancel := context.WithCancel(context.Background())
            defer cancel()
            go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

            // Make sure the batch job isn't affected
            testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
                if err := checkAllocPromoter(errCh); err != nil {
                    return false, fmt.Errorf("check alloc promoter error: %v", err)
                }

                allocs, err := state.AllocsByNode(nil, n1.ID)
                if err != nil {
                    return false, fmt.Errorf("AllocsByNode error: %v", err)
                }
                for _, alloc := range allocs {
                    if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
                        return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
                    }
                }
                return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
            }, func(err error) {
                t.Fatalf("err: %v", err)
            })

            // Force drain the node
            drainReq = &structs.NodeUpdateDrainRequest{
                NodeID: n1.ID,
                DrainStrategy: &structs.DrainStrategy{
                    DrainSpec: structs.DrainSpec{
                        Deadline: -1 * time.Second, // Force: a negative deadline drains immediately
                    },
                },
                WriteRequest: structs.WriteRequest{Region: "global"},
            }
            require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

            // Make sure the batch allocs are stopped
            testutil.WaitForResult(func() (bool, error) {
                allocs, err := state.AllocsByNode(nil, n1.ID)
                if err != nil {
                    return false, err
                }
                for _, alloc := range allocs {
                    if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
                        return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
                    }
                }
                return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
            }, func(err error) {
                t.Fatalf("err: %v", err)
            })

            // Check that the node drain is removed
            testutil.WaitForResult(func() (bool, error) {
                node, err := state.NodeByID(nil, n1.ID)
                if err != nil {
                    return false, err
                }
                return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
            }, func(err error) {
                t.Fatalf("err: %v", err)
            })

            // Check we got the right events
            node, err := state.NodeByID(nil, n1.ID)
            require.NoError(err)
            require.Len(node.Events, 4)
            require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
            require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
        })
    }
}