github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/drainer_int_test.go

package nomad

import (
	"context"
	"fmt"
	"log"
	"net/rpc"
	"testing"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

func allocPromoter(errCh chan<- error, ctx context.Context,
	state *state.StateStore, codec rpc.ClientCodec, nodeID string,
	logger *log.Logger) {

	nindex := uint64(1)
	for {
		allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			errCh <- fmt.Errorf("failed to get node allocs: %v", err)
			return
		}
		nindex = index

		// For each alloc that doesn't have its deployment status set, set it
		var updates []*structs.Allocation
		now := time.Now()
		for _, alloc := range allocs {
			if alloc.Job.Type != structs.JobTypeService {
				continue
			}

			if alloc.DeploymentStatus.HasHealth() {
				continue
			}
			newAlloc := alloc.Copy()
			newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy:   helper.BoolToPtr(true),
				Timestamp: now,
			}
			updates = append(updates, newAlloc)
			logger.Printf("Marked deployment health for alloc %q", alloc.ID)
		}

		if len(updates) == 0 {
			continue
		}

		// Send the update
		req := &structs.AllocUpdateRequest{
			Alloc:        updates,
			WriteRequest: structs.WriteRequest{Region: "global"},
		}
		var resp structs.GenericResponse
		if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil {
			if ctx.Err() == context.Canceled {
				return
			}
			errCh <- err
		}
	}
}
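
// A sketch of how the tests in this file wire these helpers together (n1, s1,
// state, and codec refer to locals created by each test): allocPromoter runs as
// a goroutine beside a blocking wait, and checkAllocPromoter is polled inside
// the wait loop so promoter failures fail the test instead of hanging it.
//
//	errCh := make(chan error, 1)
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
//	testutil.WaitForResult(func() (bool, error) {
//		if err := checkAllocPromoter(errCh); err != nil {
//			return false, err
//		}
//		// ... check that the allocs reached the desired state ...
//		return true, nil
//	}, func(err error) { t.Fatalf("err: %v", err) })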

// checkAllocPromoter is a small helper to return an error or nil from an error
// chan like the one given to the allocPromoter goroutine.
func checkAllocPromoter(errCh chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
		return nil
	}
}

func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) {
	resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Allocation), index, nil
}

func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
		// Capture all the allocations
		allocs, err := state.AllocsByNode(ws, nodeID)
		if err != nil {
			return nil, 0, err
		}

		// Use the last index that affected the allocs table
		index, err := state.Index("allocs")
		if err != nil {
			return nil, index, err
		}

		return allocs, index, nil
	}
}

func TestDrainer_Simple_ServiceOnly(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the first node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a job that runs on just one node
	job := mock.Job()
	job.Update = *structs.DefaultUpdateStrategy
	job.Update.Stagger = 30 * time.Second
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the two allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 1 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be stopped
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

func TestDrainer_DrainEmptyNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create a node
	n1 := mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 10 * time.Minute,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Check that the node drain is removed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 2 * time.Second,
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the allocs to be stopped
	var finalAllocs []*structs.Allocation
	testutil.WaitForResult(func() (bool, error) {
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}

		var err error
		finalAllocs, err = state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		for _, alloc := range finalAllocs {
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that the service allocs finished before the batch and system allocs
	var serviceMax, batchMax uint64 = 0, 0
	for _, alloc := range finalAllocs {
		if alloc.Job.Type == structs.JobTypeService && alloc.ModifyIndex > serviceMax {
			serviceMax = alloc.ModifyIndex
		} else if alloc.Job.Type == structs.JobTypeBatch && alloc.ModifyIndex > batchMax {
			batchMax = alloc.ModifyIndex
		}
	}
	require.True(serviceMax < batchMax)

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
	require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined)
}

// Test that drain is unset when batch jobs naturally finish
func TestDrainer_AllTypes_NoDeadline(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Create the second node
	nodeReg = &structs.NodeRegisterRequest{
		Node:         n2,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Drain the node
	drainReq := &structs.NodeUpdateDrainRequest{
		NodeID: n1.ID,
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{
				Deadline: 0 * time.Second, // Infinite
			},
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var drainResp structs.NodeDrainUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

	// Wait for the allocs to be replaced
	errCh := make(chan error, 2)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)
	go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger)

	// Wait for the service allocs to be stopped on the draining node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false)
		if err != nil {
			return false, err
		}
		for _, alloc := range allocs {
			if alloc.NodeID != n1.ID {
				continue
			}
			if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
				return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus)
			}
		}
		if err := checkAllocPromoter(errCh); err != nil {
			return false, err
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mark the batch allocations as finished
	allocs, err := state.AllocsByJob(nil, job.Namespace, bjob.ID, false)
	require.Nil(err)

	var updates []*structs.Allocation
	for _, alloc := range allocs {
		new := alloc.Copy()
		new.ClientStatus = structs.AllocClientStatusComplete
		updates = append(updates, new)
	}
	require.Nil(state.UpdateAllocsFromClient(1000, updates))

	// Check that the node drain is removed
	testutil.WaitForResult(func() (bool, error) {
		node, err := state.NodeByID(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the service allocations to be placed on the other node
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n2.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 3, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Check we got the right events
	node, err := state.NodeByID(nil, n1.ID)
	require.NoError(err)
	require.Len(node.Events, 3)
	require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message)
}

func TestDrainer_AllTypes_Deadline_GarbageCollectedNode(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	s1 := TestServer(t, nil)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	testutil.WaitForLeader(t, s1.RPC)

	// Create two nodes, registering the second later
	n1, n2 := mock.Node(), mock.Node()
	nodeReg := &structs.NodeRegisterRequest{
		Node:         n1,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var nodeResp structs.NodeUpdateResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

	// Create a service job that runs on just one node
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	req := &structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	var resp structs.JobRegisterResponse
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	job.CreateIndex = resp.JobModifyIndex

	// Create a system job
	sysjob := mock.SystemJob()
	req = &structs.JobRegisterRequest{
		Job: sysjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	sysjob.CreateIndex = resp.JobModifyIndex

	// Create a batch job
	bjob := mock.BatchJob()
	bjob.TaskGroups[0].Count = 2
	req = &structs.JobRegisterRequest{
		Job: bjob,
		WriteRequest: structs.WriteRequest{
			Region:    "global",
			Namespace: job.Namespace,
		},
	}

	// Fetch the response
	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
	require.NotZero(resp.Index)
	bjob.CreateIndex = resp.JobModifyIndex

	// Wait for the allocations to be placed
	state := s1.State()
	testutil.WaitForResult(func() (bool, error) {
		allocs, err := state.AllocsByNode(nil, n1.ID)
		if err != nil {
			return false, err
		}
		return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs))
	}, func(err error) {
t.Fatalf("err: %v", err) 772 }) 773 774 // Create some old terminal allocs for each job that point at a non-existent 775 // node to simulate it being on a GC'd node. 776 var badAllocs []*structs.Allocation 777 for _, job := range []*structs.Job{job, sysjob, bjob} { 778 alloc := mock.Alloc() 779 alloc.Namespace = job.Namespace 780 alloc.Job = job 781 alloc.JobID = job.ID 782 alloc.NodeID = uuid.Generate() 783 alloc.TaskGroup = job.TaskGroups[0].Name 784 alloc.DesiredStatus = structs.AllocDesiredStatusStop 785 alloc.ClientStatus = structs.AllocClientStatusComplete 786 badAllocs = append(badAllocs, alloc) 787 } 788 require.NoError(state.UpsertAllocs(1, badAllocs)) 789 790 // Create the second node 791 nodeReg = &structs.NodeRegisterRequest{ 792 Node: n2, 793 WriteRequest: structs.WriteRequest{Region: "global"}, 794 } 795 require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) 796 797 // Drain the node 798 drainReq := &structs.NodeUpdateDrainRequest{ 799 NodeID: n1.ID, 800 DrainStrategy: &structs.DrainStrategy{ 801 DrainSpec: structs.DrainSpec{ 802 Deadline: 2 * time.Second, 803 }, 804 }, 805 WriteRequest: structs.WriteRequest{Region: "global"}, 806 } 807 var drainResp structs.NodeDrainUpdateResponse 808 require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) 809 810 // Wait for the allocs to be replaced 811 errCh := make(chan error, 2) 812 ctx, cancel := context.WithCancel(context.Background()) 813 defer cancel() 814 go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger) 815 go allocPromoter(errCh, ctx, state, codec, n2.ID, s1.logger) 816 817 // Wait for the allocs to be stopped 818 var finalAllocs []*structs.Allocation 819 testutil.WaitForResult(func() (bool, error) { 820 if err := checkAllocPromoter(errCh); err != nil { 821 return false, err 822 } 823 824 var err error 825 finalAllocs, err = state.AllocsByNode(nil, n1.ID) 826 if err != nil { 827 return false, err 828 } 829 for _, alloc := range finalAllocs { 830 if alloc.DesiredStatus != structs.AllocDesiredStatusStop { 831 return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus) 832 } 833 } 834 return true, nil 835 }, func(err error) { 836 t.Fatalf("err: %v", err) 837 }) 838 839 // Check that the node drain is removed 840 testutil.WaitForResult(func() (bool, error) { 841 node, err := state.NodeByID(nil, n1.ID) 842 if err != nil { 843 return false, err 844 } 845 return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") 846 }, func(err error) { 847 t.Fatalf("err: %v", err) 848 }) 849 850 // Wait for the allocations to be placed on the other node 851 testutil.WaitForResult(func() (bool, error) { 852 allocs, err := state.AllocsByNode(nil, n2.ID) 853 if err != nil { 854 return false, err 855 } 856 return len(allocs) == 5, fmt.Errorf("got %d allocs", len(allocs)) 857 }, func(err error) { 858 t.Fatalf("err: %v", err) 859 }) 860 861 // Check we got the right events 862 node, err := state.NodeByID(nil, n1.ID) 863 require.NoError(err) 864 require.Len(node.Events, 3) 865 require.Equal(drainer.NodeDrainEventComplete, node.Events[2].Message) 866 require.Contains(node.Events[2].Details, drainer.NodeDrainEventDetailDeadlined) 867 } 868 869 // Test that transitions to force drain work. 

// Test that transitions to force drain work.
func TestDrainer_Batch_TransitionToForce(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	for _, inf := range []bool{true, false} {
		name := "Infinite"
		if !inf {
			name = "Deadline"
		}
		t.Run(name, func(t *testing.T) {
			s1 := TestServer(t, nil)
			defer s1.Shutdown()
			codec := rpcClient(t, s1)
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			n1 := mock.Node()
			nodeReg := &structs.NodeRegisterRequest{
				Node:         n1,
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var nodeResp structs.NodeUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp))

			// Create a batch job
			bjob := mock.BatchJob()
			bjob.TaskGroups[0].Count = 2
			req := &structs.JobRegisterRequest{
				Job: bjob,
				WriteRequest: structs.WriteRequest{
					Region:    "global",
					Namespace: bjob.Namespace,
				},
			}

			// Fetch the response
			var resp structs.JobRegisterResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp))
			require.NotZero(resp.Index)

			// Wait for the allocations to be placed
			state := s1.State()
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Pick the deadline
			deadline := 0 * time.Second
			if !inf {
				deadline = 10 * time.Second
			}

			// Drain the node
			drainReq := &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: deadline,
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			var drainResp structs.NodeDrainUpdateResponse
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Wait for the allocs to be replaced
			errCh := make(chan error, 1)
			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
			go allocPromoter(errCh, ctx, state, codec, n1.ID, s1.logger)

			// Make sure the batch job isn't affected
			testutil.AssertUntil(500*time.Millisecond, func() (bool, error) {
				if err := checkAllocPromoter(errCh); err != nil {
					return false, err
				}

				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusRun {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Force drain the node
			drainReq = &structs.NodeUpdateDrainRequest{
				NodeID: n1.ID,
				DrainStrategy: &structs.DrainStrategy{
					DrainSpec: structs.DrainSpec{
						Deadline: -1 * time.Second, // Force drain
					},
				},
				WriteRequest: structs.WriteRequest{Region: "global"},
			}
			require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp))

			// Make sure the batch job is migrated
			testutil.WaitForResult(func() (bool, error) {
				allocs, err := state.AllocsByNode(nil, n1.ID)
				if err != nil {
					return false, err
				}
				for _, alloc := range allocs {
					if alloc.DesiredStatus != structs.AllocDesiredStatusStop {
						return false, fmt.Errorf("got status %v", alloc.DesiredStatus)
					}
				}
				return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs))
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check that the node drain is removed
			testutil.WaitForResult(func() (bool, error) {
				node, err := state.NodeByID(nil, n1.ID)
				if err != nil {
					return false, err
				}
				return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set")
			}, func(err error) {
				t.Fatalf("err: %v", err)
			})

			// Check we got the right events
			node, err := state.NodeByID(nil, n1.ID)
			require.NoError(err)
			require.Len(node.Events, 4)
			require.Equal(drainer.NodeDrainEventComplete, node.Events[3].Message)
			require.Contains(node.Events[3].Details, drainer.NodeDrainEventDetailDeadlined)
		})
	}
}