github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/integration_test.go (about) 1 // Package integration_test. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package integration_test 6 7 import ( 8 "errors" 9 "math/rand" 10 "os" 11 "path/filepath" 12 "strconv" 13 "sync" 14 "testing" 15 "time" 16 17 "github.com/NVIDIA/aistore/api" 18 "github.com/NVIDIA/aistore/api/apc" 19 "github.com/NVIDIA/aistore/cmn" 20 "github.com/NVIDIA/aistore/cmn/cos" 21 "github.com/NVIDIA/aistore/core/meta" 22 "github.com/NVIDIA/aistore/tools" 23 "github.com/NVIDIA/aistore/tools/docker" 24 "github.com/NVIDIA/aistore/tools/readers" 25 "github.com/NVIDIA/aistore/tools/tassert" 26 "github.com/NVIDIA/aistore/tools/tlog" 27 "github.com/NVIDIA/aistore/xact" 28 ) 29 30 // Intended for a deployment with multiple targets 31 // 1. Create ais bucket 32 // 2. Unregister target T 33 // 3. PUT large amount of objects into the ais bucket 34 // 4. GET the objects while simultaneously registering the target T 35 func TestGetAndReRegisterInParallel(t *testing.T) { 36 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 37 var ( 38 m = ioContext{ 39 t: t, 40 num: 50000, 41 numGetsEachFile: 3, 42 fileSize: 10 * cos.KiB, 43 } 44 rebID string 45 ) 46 47 m.initAndSaveState(true /*cleanup*/) 48 m.expectTargets(2) 49 50 // Step 1. 51 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 52 53 // Step 2. 54 target := m.startMaintenanceNoRebalance() 55 56 // Step 3. 57 m.puts() 58 59 // Step 4. 60 wg := &sync.WaitGroup{} 61 wg.Add(2) 62 go func() { 63 // without defer, if gets crashes Done is not called resulting in test hangs 64 defer wg.Done() 65 m.gets(nil, false) 66 }() 67 68 time.Sleep(time.Second * 3) // give gets some room to breathe 69 go func() { 70 // without defer, if reregister crashes Done is not called resulting in test hangs 71 defer wg.Done() 72 rebID = m.stopMaintenance(target) 73 }() 74 wg.Wait() 75 76 m.ensureNoGetErrors() 77 m.waitAndCheckCluState() 78 tools.WaitForRebalanceByID(t, baseParams, rebID) 79 } 80 81 // All of the above PLUS proxy failover/failback sequence in parallel: 82 // 1. Create an ais bucket 83 // 2. Unregister a target 84 // 3. Crash the primary proxy and PUT in parallel 85 // 4. Failback to the original primary proxy, register target, and GET in parallel 86 func TestProxyFailbackAndReRegisterInParallel(t *testing.T) { 87 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, MinTargets: 2, MinProxies: 3}) 88 m := ioContext{ 89 t: t, 90 otherTasksToTrigger: 1, 91 num: 150000, 92 } 93 94 m.initAndSaveState(true /*cleanup*/) 95 96 // Step 1. 97 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 98 99 // Step 2. 100 target := m.startMaintenanceNoRebalance() 101 102 // Step 3. 103 _, newPrimaryURL, err := chooseNextProxy(m.smap) 104 tassert.CheckFatal(t, err) 105 // use a new proxyURL because primaryCrashElectRestart has a side-effect: 106 // it changes the primary proxy. Without the change tools.PutRandObjs is 107 // failing while the current primary is restarting and rejoining 108 m.proxyURL = newPrimaryURL 109 110 wg := &sync.WaitGroup{} 111 wg.Add(1) 112 go func() { 113 defer wg.Done() 114 killRestorePrimary(t, m.proxyURL, false, nil) 115 }() 116 117 // delay PUTs to ensure they run during primary elections 118 time.Sleep(5 * time.Second) 119 m.puts() 120 wg.Wait() 121 122 // Step 4: (three tasks) 123 wg.Add(3) 124 go func() { 125 defer wg.Done() 126 m.stopMaintenance(target) 127 }() 128 go func() { 129 defer wg.Done() 130 m.gets(nil, false) 131 }() 132 go func() { 133 defer wg.Done() 134 <-m.controlCh // <-- half GETs 135 primarySetToRand(t) 136 }() 137 wg.Wait() 138 139 xargs := xact.ArgsMsg{Kind: apc.ActRebalance, OnlyRunning: true, Timeout: tools.RebalanceTimeout} 140 _, _ = api.WaitForXactionIC(baseParams, &xargs) 141 142 // Step 5. 143 m.ensureNoGetErrors() 144 m.waitAndCheckCluState() 145 } 146 147 // Similar to TestGetAndReRegisterInParallel, but instead of unregister, we kill the target 148 // 1. Kill registered target and wait for Smap to updated 149 // 2. Create ais bucket 150 // 3. PUT large amounts of objects into ais bucket 151 // 4. Get the objects while simultaneously registering the target 152 func TestGetAndRestoreInParallel(t *testing.T) { 153 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, RequiredDeployment: tools.ClusterTypeLocal}) 154 155 var ( 156 m = ioContext{ 157 t: t, 158 num: 20000, 159 numGetsEachFile: 5, 160 fileSize: cos.KiB * 2, 161 } 162 targetNode *meta.Snode 163 ) 164 165 m.initAndSaveState(true /*cleanup*/) 166 m.expectTargets(3) 167 168 // Step 1 169 // Select a random target 170 targetNode, _ = m.smap.GetRandTarget() 171 tlog.Logf("Killing %s\n", targetNode.StringEx()) 172 tcmd, err := tools.KillNode(targetNode) 173 tassert.CheckFatal(t, err) 174 175 proxyURL := tools.RandomProxyURL(t) 176 m.smap, err = tools.WaitForClusterState(proxyURL, "target removed", m.smap.Version, m.originalProxyCount, 177 m.originalTargetCount-1) 178 tassert.CheckError(t, err) 179 180 // Step 2 181 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 182 183 // Step 3 184 m.puts() 185 186 // Step 4 187 wg := &sync.WaitGroup{} 188 wg.Add(2) 189 go func() { 190 defer wg.Done() 191 time.Sleep(4 * time.Second) 192 tools.RestoreNode(tcmd, false, "target") 193 }() 194 go func() { 195 defer wg.Done() 196 m.gets(nil, false) 197 }() 198 wg.Wait() 199 200 m.ensureNoGetErrors() 201 m.waitAndCheckCluState() 202 tools.WaitForRebalAndResil(m.t, tools.BaseAPIParams(m.proxyURL)) 203 } 204 205 func TestUnregisterPreviouslyUnregisteredTarget(t *testing.T) { 206 m := ioContext{t: t} 207 m.initAndSaveState(true /*cleanup*/) 208 m.expectTargets(1) 209 target := m.startMaintenanceNoRebalance() 210 211 // Decommission the same target again. 212 args := &apc.ActValRmNode{DaemonID: target.ID(), SkipRebalance: true} 213 _, err := api.StartMaintenance(tools.BaseAPIParams(m.proxyURL), args) 214 tassert.Errorf(t, err != nil, "error expected") 215 216 n := tools.GetClusterMap(t, m.proxyURL).CountActiveTs() 217 if n != m.originalTargetCount-1 { 218 t.Fatalf("expected %d targets after putting target in maintenance, got %d targets", 219 m.originalTargetCount-1, n) 220 } 221 222 // Register target (bring cluster to normal state) 223 rebID := m.stopMaintenance(target) 224 m.waitAndCheckCluState() 225 tools.WaitForRebalanceByID(m.t, tools.BaseAPIParams(m.proxyURL), rebID) 226 } 227 228 func TestRegisterAndUnregisterTargetAndPutInParallel(t *testing.T) { 229 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 230 231 m := ioContext{ 232 t: t, 233 num: 10000, 234 } 235 236 m.initAndSaveState(true /*cleanup*/) 237 m.expectTargets(3) 238 239 targets := m.smap.Tmap.ActiveNodes() 240 241 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 242 243 // Unregister target[0] 244 args := &apc.ActValRmNode{DaemonID: targets[0].ID(), SkipRebalance: true} 245 baseParams := tools.BaseAPIParams(m.proxyURL) 246 _, err := api.StartMaintenance(baseParams, args) 247 tassert.CheckFatal(t, err) 248 tools.WaitForClusterState( 249 m.proxyURL, 250 "put target in maintenance", 251 m.smap.Version, 252 m.originalProxyCount, 253 m.originalTargetCount-1, 254 ) 255 256 n := tools.GetClusterMap(t, m.proxyURL).CountActiveTs() 257 if n != m.originalTargetCount-1 { 258 t.Fatalf("expected %d targets after putting target in maintenance, got %d targets", 259 m.originalTargetCount-1, n) 260 } 261 262 // Do puts in parallel 263 wg := &sync.WaitGroup{} 264 wg.Add(3) 265 go func() { 266 defer wg.Done() 267 m.puts() 268 }() 269 270 // Register target 0 in parallel 271 go func() { 272 defer wg.Done() 273 args := &apc.ActValRmNode{DaemonID: targets[0].ID()} 274 tlog.Logf("Take %s out of maintenance mode ...\n", targets[0].StringEx()) 275 _, err = api.StopMaintenance(baseParams, args) 276 tassert.CheckFatal(t, err) 277 }() 278 279 // Decommission target[1] in parallel 280 go func() { 281 defer wg.Done() 282 args := &apc.ActValRmNode{DaemonID: targets[1].ID(), SkipRebalance: true} 283 _, err = api.StartMaintenance(baseParams, args) 284 tassert.CheckFatal(t, err) 285 }() 286 287 // Wait for everything to end 288 wg.Wait() 289 290 // Register target 1 to bring cluster to original state 291 rebID := m.stopMaintenance(targets[1]) 292 293 // wait for rebalance to complete 294 tools.WaitForRebalanceByID(t, baseParams, rebID) 295 296 m.waitAndCheckCluState() 297 } 298 299 func TestAckRebalance(t *testing.T) { 300 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 301 302 m := ioContext{ 303 t: t, 304 num: 30000, 305 getErrIsFatal: true, 306 } 307 308 m.initAndSaveState(true /*cleanup*/) 309 m.expectTargets(3) 310 311 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 312 313 target := m.startMaintenanceNoRebalance() 314 315 // Start putting files into bucket. 316 m.puts() 317 318 rebID := m.stopMaintenance(target) 319 320 // Wait for everything to finish. 321 baseParams := tools.BaseAPIParams(m.proxyURL) 322 tools.WaitForRebalanceByID(t, baseParams, rebID) 323 324 m.gets(nil, false) 325 326 m.ensureNoGetErrors() 327 m.waitAndCheckCluState() 328 } 329 330 func TestStressRebalance(t *testing.T) { 331 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 332 333 m := &ioContext{ 334 t: t, 335 } 336 337 m.initAndSaveState(true /*cleanup*/) 338 m.expectTargets(4) 339 340 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 341 342 for i := 1; i <= 3; i++ { 343 tlog.Logf("Iteration #%d ======\n", i) 344 testStressRebalance(t, m.bck) 345 } 346 } 347 348 func testStressRebalance(t *testing.T, bck cmn.Bck) { 349 m := &ioContext{ 350 t: t, 351 bck: bck, 352 num: 50000, 353 getErrIsFatal: true, 354 } 355 356 m.initAndSaveState(true /*cleanup*/) 357 358 tgts := m.smap.Tmap.ActiveNodes() 359 i1 := rand.Intn(len(tgts)) 360 i2 := (i1 + 1) % len(tgts) 361 target1, target2 := tgts[i1], tgts[i2] 362 363 // Unregister targets. 364 tlog.Logf("Killing %s and %s\n", target1.StringEx(), target2.StringEx()) 365 cmd1, err := tools.KillNode(target1) 366 tassert.CheckFatal(t, err) 367 time.Sleep(time.Second) 368 cmd2, err := tools.KillNode(target2) 369 tassert.CheckFatal(t, err) 370 371 // Start putting objects into bucket 372 m.puts() 373 374 // Get objects and register targets in parallel 375 wg := &sync.WaitGroup{} 376 wg.Add(1) 377 go func() { 378 defer wg.Done() 379 m.gets(nil, false) 380 }() 381 382 // and join 2 targets in parallel 383 time.Sleep(time.Second) 384 err = tools.RestoreNode(cmd1, false, "the 1st target") 385 tassert.CheckFatal(t, err) 386 387 // random sleep between the first and the second join 388 time.Sleep(time.Duration(rand.Intn(3)+1) * time.Second) 389 390 err = tools.RestoreNode(cmd2, false, "the 2nd target") 391 tassert.CheckFatal(t, err) 392 393 _, err = tools.WaitForClusterState( 394 m.proxyURL, 395 "targets to join", 396 m.smap.Version, 397 m.originalProxyCount, 398 m.originalTargetCount, 399 ) 400 tassert.CheckFatal(m.t, err) 401 402 // wait for the rebalance to finish 403 baseParams := tools.BaseAPIParams(m.proxyURL) 404 tools.WaitForRebalAndResil(t, baseParams) 405 406 // wait for the reads to run out 407 wg.Wait() 408 409 m.ensureNoGetErrors() 410 m.waitAndCheckCluState() 411 } 412 413 func TestRebalanceAfterUnregisterAndReregister(t *testing.T) { 414 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 415 m := ioContext{ 416 t: t, 417 num: 10000, 418 } 419 m.initAndSaveState(true /*cleanup*/) 420 m.expectTargets(3) 421 422 targets := m.smap.Tmap.ActiveNodes() 423 424 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 425 426 target0, target1 := targets[0], targets[1] 427 args := &apc.ActValRmNode{DaemonID: target0.ID(), SkipRebalance: true} 428 baseParams := tools.BaseAPIParams(m.proxyURL) 429 _, err := api.StartMaintenance(baseParams, args) 430 tassert.CheckFatal(t, err) 431 432 _, err = tools.WaitForClusterState( 433 m.proxyURL, 434 "put target in maintenance", 435 m.smap.Version, 436 m.originalProxyCount, 437 m.originalTargetCount-1, 438 ) 439 tassert.CheckFatal(m.t, err) 440 441 // Put some files 442 m.puts() 443 444 // Register target 0 in parallel 445 wg := &sync.WaitGroup{} 446 wg.Add(2) 447 go func() { 448 defer wg.Done() 449 tlog.Logf("Take %s out of maintenance mode ...\n", target0.StringEx()) 450 args := &apc.ActValRmNode{DaemonID: target0.ID()} 451 _, err = api.StopMaintenance(baseParams, args) 452 tassert.CheckFatal(t, err) 453 }() 454 455 // Unregister target 1 in parallel 456 go func() { 457 defer wg.Done() 458 err = tools.RemoveNodeUnsafe(m.proxyURL, target1.ID()) 459 tassert.CheckFatal(t, err) 460 }() 461 462 // Wait for everything to end 463 wg.Wait() 464 465 // Register target 1 to bring cluster to original state 466 sleep := time.Duration(rand.Intn(5))*time.Second + time.Millisecond 467 time.Sleep(sleep) 468 tlog.Logf("Join %s back\n", target1.StringEx()) 469 rebID, err := tools.JoinCluster(m.proxyURL, target1) 470 tassert.CheckFatal(t, err) 471 _, err = tools.WaitForClusterState( 472 m.proxyURL, 473 "targets to join", 474 m.smap.Version, 475 m.originalProxyCount, 476 m.originalTargetCount, 477 ) 478 tassert.CheckFatal(m.t, err) 479 480 time.Sleep(sleep) 481 tools.WaitForRebalanceByID(t, baseParams, rebID) 482 483 m.gets(nil, false) 484 485 m.ensureNoGetErrors() 486 m.waitAndCheckCluState() 487 } 488 489 func TestPutDuringRebalance(t *testing.T) { 490 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 491 492 m := ioContext{ 493 t: t, 494 num: 10000, 495 } 496 497 m.initAndSaveState(true /*cleanup*/) 498 m.expectTargets(3) 499 500 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 501 502 target := m.startMaintenanceNoRebalance() 503 504 // Start putting files and register target in parallel. 505 wg := &sync.WaitGroup{} 506 wg.Add(1) 507 go func() { 508 defer wg.Done() 509 m.puts() 510 }() 511 512 // Sleep some time to wait for PUT operations to begin. 513 time.Sleep(3 * time.Second) 514 515 rebID := m.stopMaintenance(target) 516 517 // Wait for everything to finish. 518 wg.Wait() 519 baseParams := tools.BaseAPIParams(m.proxyURL) 520 tools.WaitForRebalanceByID(t, baseParams, rebID) 521 522 // Main check - try to read all objects. 523 m.gets(nil, false) 524 525 m.checkObjectDistribution(t) 526 m.waitAndCheckCluState() 527 } 528 529 func TestGetDuringLocalAndGlobalRebalance(t *testing.T) { 530 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 531 532 var ( 533 m = ioContext{ 534 t: t, 535 num: 10000, 536 numGetsEachFile: 3, 537 } 538 baseParams = tools.BaseAPIParams() 539 selectedTarget *meta.Snode 540 killTarget *meta.Snode 541 ) 542 543 m.initAndSaveState(true /*cleanup*/) 544 m.expectTargets(2) 545 546 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 547 548 // Select a random target to disable one of its mountpaths, 549 // and another random target to unregister. 550 for _, target := range m.smap.Tmap { 551 if selectedTarget != nil { 552 killTarget = target 553 break 554 } 555 selectedTarget = target 556 } 557 mpList, err := api.GetMountpaths(baseParams, selectedTarget) 558 tassert.CheckFatal(t, err) 559 ensureNoDisabledMountpaths(t, selectedTarget, mpList) 560 561 if len(mpList.Available) < 2 { 562 t.Fatalf("Must have at least 2 mountpaths") 563 } 564 565 // Disable mountpaths temporarily 566 mpath := mpList.Available[0] 567 tlog.Logf("Disable mountpath at target %s\n", selectedTarget.ID()) 568 err = api.DisableMountpath(baseParams, selectedTarget, mpath, false /*dont-resil*/) 569 tassert.CheckFatal(t, err) 570 571 args := &apc.ActValRmNode{DaemonID: killTarget.ID(), SkipRebalance: true} 572 _, err = api.StartMaintenance(baseParams, args) 573 tassert.CheckFatal(t, err) 574 smap, err := tools.WaitForClusterState( 575 m.proxyURL, 576 "target removal", 577 m.smap.Version, 578 m.originalProxyCount, 579 m.originalTargetCount-1, 580 ) 581 tassert.CheckFatal(m.t, err) 582 583 m.puts() 584 585 // Start getting objects 586 wg := &sync.WaitGroup{} 587 wg.Add(1) 588 go func() { 589 defer wg.Done() 590 m.gets(nil, false) 591 }() 592 593 // Let's give gets some momentum 594 time.Sleep(time.Second * 4) 595 596 // register a new target 597 args = &apc.ActValRmNode{DaemonID: killTarget.ID()} 598 _, err = api.StopMaintenance(baseParams, args) 599 tassert.CheckFatal(t, err) 600 601 // enable mountpath 602 err = api.EnableMountpath(baseParams, selectedTarget, mpath) 603 tassert.CheckFatal(t, err) 604 605 // wait until GETs are done while 2 rebalance are running 606 wg.Wait() 607 608 // make sure that the cluster has all targets enabled 609 _, err = tools.WaitForClusterState( 610 m.proxyURL, 611 "target joined back", 612 smap.Version, 613 m.originalProxyCount, 614 m.originalTargetCount, 615 ) 616 tassert.CheckFatal(m.t, err) 617 618 // wait for rebalance to complete 619 baseParams = tools.BaseAPIParams(m.proxyURL) 620 tools.WaitForRebalAndResil(t, baseParams) 621 622 m.ensureNoGetErrors() 623 m.waitAndCheckCluState() 624 m.ensureNumMountpaths(selectedTarget, mpList) 625 } 626 627 func TestGetDuringResilver(t *testing.T) { 628 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 629 630 var ( 631 m = ioContext{ 632 t: t, 633 num: 20000, 634 } 635 baseParams = tools.BaseAPIParams() 636 ) 637 638 m.initAndSaveState(true /*cleanup*/) 639 m.expectTargets(1) 640 641 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 642 643 target, _ := m.smap.GetRandTarget() 644 mpList, err := api.GetMountpaths(baseParams, target) 645 tassert.CheckFatal(t, err) 646 ensureNoDisabledMountpaths(t, target, mpList) 647 648 if len(mpList.Available) < 2 { 649 t.Fatalf("Must have at least 2 mountpaths") 650 } 651 652 // select up to 2 mountpath 653 mpaths := []string{mpList.Available[0]} 654 if len(mpList.Available) > 2 { 655 mpaths = append(mpaths, mpList.Available[1]) 656 } 657 658 // Disable mountpaths temporarily 659 for _, mp := range mpaths { 660 err = api.DisableMountpath(baseParams, target, mp, false /*dont-resil*/) 661 tassert.CheckFatal(t, err) 662 } 663 664 m.puts() 665 666 // Start getting objects and enable mountpaths in parallel 667 wg := &sync.WaitGroup{} 668 wg.Add(1) 669 go func() { 670 defer wg.Done() 671 m.getsUntilStop() 672 }() 673 674 for _, mp := range mpaths { 675 time.Sleep(time.Second) 676 err = api.EnableMountpath(baseParams, target, mp) 677 tassert.CheckFatal(t, err) 678 } 679 m.stopGets() 680 681 wg.Wait() 682 time.Sleep(2 * time.Second) 683 684 tlog.Logf("Wait for rebalance (when target %s that has previously lost all mountpaths joins back)\n", target.StringEx()) 685 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 686 _, _ = api.WaitForXactionIC(baseParams, &args) 687 688 tools.WaitForResilvering(t, baseParams, nil) 689 690 m.ensureNoGetErrors() 691 m.ensureNumMountpaths(target, mpList) 692 } 693 694 func TestGetDuringRebalance(t *testing.T) { 695 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 696 697 m := ioContext{ 698 t: t, 699 num: 30000, 700 } 701 702 m.initAndSaveState(true /*cleanup*/) 703 m.expectTargets(3) 704 705 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 706 707 target := m.startMaintenanceNoRebalance() 708 709 m.puts() 710 711 // Start getting objects and register target in parallel. 712 wg := &sync.WaitGroup{} 713 wg.Add(1) 714 go func() { 715 defer wg.Done() 716 m.gets(nil, false) 717 }() 718 719 rebID := m.stopMaintenance(target) 720 721 // Wait for everything to finish. 722 baseParams := tools.BaseAPIParams(m.proxyURL) 723 tools.WaitForRebalanceByID(t, baseParams, rebID) 724 wg.Wait() 725 726 // Get objects once again to check if they are still accessible after rebalance. 727 m.gets(nil, false) 728 729 m.ensureNoGetErrors() 730 m.waitAndCheckCluState() 731 } 732 733 func TestRegisterTargetsAndCreateBucketsInParallel(t *testing.T) { 734 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 735 736 const ( 737 unregisterTargetCount = 2 738 newBucketCount = 3 739 ) 740 741 m := ioContext{ 742 t: t, 743 } 744 745 m.initAndSaveState(true /*cleanup*/) 746 m.expectTargets(3) 747 748 targets := m.smap.Tmap.ActiveNodes() 749 baseParams := tools.BaseAPIParams(m.proxyURL) 750 751 // Decommission targets 752 for i := range unregisterTargetCount { 753 args := &apc.ActValRmNode{DaemonID: targets[i].ID(), SkipRebalance: true} 754 _, err := api.StartMaintenance(baseParams, args) 755 tassert.CheckError(t, err) 756 } 757 tools.WaitForClusterState( 758 m.proxyURL, 759 "remove targets", 760 m.smap.Version, 761 m.originalProxyCount, 762 m.originalTargetCount-unregisterTargetCount, 763 ) 764 765 wg := &sync.WaitGroup{} 766 wg.Add(unregisterTargetCount) 767 for i := range unregisterTargetCount { 768 go func(number int) { 769 defer wg.Done() 770 args := &apc.ActValRmNode{DaemonID: targets[number].ID()} 771 _, err := api.StopMaintenance(baseParams, args) 772 tassert.CheckError(t, err) 773 }(i) 774 } 775 776 wg.Add(newBucketCount) 777 for i := range newBucketCount { 778 bck := m.bck 779 bck.Name += strconv.Itoa(i) 780 781 go func() { 782 defer wg.Done() 783 tools.CreateBucket(t, m.proxyURL, bck, nil, true /*cleanup*/) 784 }() 785 } 786 wg.Wait() 787 m.waitAndCheckCluState() 788 tools.WaitForRebalAndResil(t, baseParams) 789 } 790 791 func TestMountpathDetachAll(t *testing.T) { 792 if true { 793 t.Skipf("skipping %s", t.Name()) // TODO -- FIXME: add back, here and elsewhere 794 } 795 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, MinTargets: 2}) 796 797 var ( 798 m = ioContext{ 799 t: t, 800 num: 5000, 801 numGetsEachFile: 2, 802 } 803 baseParams = tools.BaseAPIParams() 804 ) 805 806 m.initAndSaveState(true /*cleanup*/) 807 m.expectTargets(2) 808 809 target, _ := m.smap.GetRandTarget() 810 tname := target.StringEx() 811 origMountpaths, err := api.GetMountpaths(baseParams, target) 812 tassert.CheckFatal(t, err) 813 ensureNoDisabledMountpaths(t, target, origMountpaths) 814 815 // Remove all mountpaths on the target 816 for _, mpath := range origMountpaths.Available { 817 err = api.DetachMountpath(baseParams, target, mpath, false /*dont-resil*/) 818 tassert.CheckFatal(t, err) 819 } 820 821 time.Sleep(time.Second) 822 tlog.Logf("Wait for rebalance (triggered by %s leaving the cluster after having lost all mountpaths)\n", tname) 823 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 824 _, _ = api.WaitForXactionIC(baseParams, &args) 825 826 // Check if mountpaths were actually removed 827 mountpaths, err := api.GetMountpaths(baseParams, target) 828 tassert.CheckFatal(t, err) 829 830 if len(mountpaths.Available) != 0 { 831 t.Fatalf("%s should not have any paths available: %d", tname, len(mountpaths.Available)) 832 } 833 834 // Create ais bucket 835 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 836 837 // Add target mountpath again 838 for _, mpath := range origMountpaths.Available { 839 err = api.AttachMountpath(baseParams, target, mpath) 840 tassert.CheckFatal(t, err) 841 } 842 843 time.Sleep(2 * time.Second) 844 tlog.Logf("Wait for rebalance (when target %s that has previously lost all mountpaths joins back)\n", target.StringEx()) 845 args = xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 846 _, _ = api.WaitForXactionIC(baseParams, &args) 847 848 tools.WaitForResilvering(t, baseParams, target) 849 850 // random read/write 851 m.puts() 852 m.gets(nil, false) 853 854 m.ensureNoGetErrors() 855 m.ensureNumMountpaths(target, origMountpaths) 856 } 857 858 func TestResilverAfterAddingMountpath(t *testing.T) { 859 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 860 var ( 861 m = ioContext{ 862 t: t, 863 num: 5000, 864 numGetsEachFile: 2, 865 } 866 baseParams = tools.BaseAPIParams() 867 ) 868 869 m.initAndSaveState(true /*cleanup*/) 870 m.expectTargets(1) 871 target, _ := m.smap.GetRandTarget() 872 mpList, err := api.GetMountpaths(baseParams, target) 873 tassert.CheckFatal(t, err) 874 ensureNoDisabledMountpaths(t, target, mpList) 875 876 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 877 878 if docker.IsRunning() { 879 err := docker.CreateMpathDir(0, testMpath) 880 tassert.CheckFatal(t, err) 881 } else { 882 err := cos.CreateDir(testMpath) 883 tassert.CheckFatal(t, err) 884 } 885 886 defer func() { 887 if !docker.IsRunning() { 888 os.RemoveAll(testMpath) 889 } 890 }() 891 892 m.puts() 893 894 // Add new mountpath to target 895 tlog.Logf("attach new %q at target %s\n", testMpath, target.StringEx()) 896 err = api.AttachMountpath(baseParams, target, testMpath) 897 tassert.CheckFatal(t, err) 898 899 tools.WaitForResilvering(t, baseParams, target) 900 901 m.gets(nil, false) 902 903 // Remove new mountpath from target 904 tlog.Logf("detach %q from target %s\n", testMpath, target.StringEx()) 905 if docker.IsRunning() { 906 if err := api.DetachMountpath(baseParams, target, testMpath, false /*dont-resil*/); err != nil { 907 t.Error(err.Error()) 908 } 909 } else { 910 err = api.DetachMountpath(baseParams, target, testMpath, false /*dont-resil*/) 911 tassert.CheckFatal(t, err) 912 } 913 914 m.ensureNoGetErrors() 915 916 tools.WaitForResilvering(t, baseParams, target) 917 m.ensureNumMountpaths(target, mpList) 918 } 919 920 func TestAttachDetachMountpathAllTargets(t *testing.T) { 921 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 922 var ( 923 m = ioContext{ 924 t: t, 925 num: 10000, 926 numGetsEachFile: 5, 927 } 928 baseParams = tools.BaseAPIParams() 929 930 allMps = make(map[string]*apc.MountpathList) 931 ) 932 933 m.initAndSaveState(true /*cleanup*/) 934 m.expectTargets(1) 935 936 targets := m.smap.Tmap.ActiveNodes() 937 938 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 939 940 defer func() { 941 if !docker.IsRunning() { 942 os.RemoveAll(testMpath) 943 } 944 }() 945 946 // PUT random objects 947 m.puts() 948 949 if docker.IsRunning() { 950 err := docker.CreateMpathDir(0, testMpath) 951 tassert.CheckFatal(t, err) 952 for _, target := range targets { 953 mpList, err := api.GetMountpaths(baseParams, target) 954 tassert.CheckFatal(t, err) 955 allMps[target.ID()] = mpList 956 957 err = api.AttachMountpath(baseParams, target, testMpath) 958 tassert.CheckFatal(t, err) 959 } 960 } else { 961 // Add new mountpath to all targets 962 for idx, target := range targets { 963 mpList, err := api.GetMountpaths(baseParams, target) 964 tassert.CheckFatal(t, err) 965 allMps[target.ID()] = mpList 966 967 mountpath := filepath.Join(testMpath, strconv.Itoa(idx)) 968 cos.CreateDir(mountpath) 969 err = api.AttachMountpath(baseParams, target, mountpath) 970 tassert.CheckFatal(t, err) 971 } 972 } 973 974 tools.WaitForResilvering(t, baseParams, nil) 975 976 // Read after rebalance 977 m.gets(nil, false) 978 979 // Remove new mountpath from all targets 980 if docker.IsRunning() { 981 err := docker.RemoveMpathDir(0, testMpath) 982 tassert.CheckFatal(t, err) 983 for _, target := range targets { 984 if err := api.DetachMountpath(baseParams, target, testMpath, false /*dont-resil*/); err != nil { 985 t.Error(err.Error()) 986 } 987 } 988 } else { 989 for idx, target := range targets { 990 mountpath := filepath.Join(testMpath, strconv.Itoa(idx)) 991 os.RemoveAll(mountpath) 992 if err := api.DetachMountpath(baseParams, target, mountpath, false /*dont-resil*/); err != nil { 993 t.Error(err.Error()) 994 } 995 } 996 } 997 998 tools.WaitForResilvering(t, baseParams, nil) 999 1000 m.ensureNoGetErrors() 1001 for _, target := range targets { 1002 m.ensureNumMountpaths(target, allMps[target.ID()]) 1003 } 1004 } 1005 1006 func TestMountpathDisableAll(t *testing.T) { 1007 var ( 1008 m = ioContext{ 1009 t: t, 1010 num: 5000, 1011 numGetsEachFile: 2, 1012 } 1013 baseParams = tools.BaseAPIParams() 1014 ) 1015 1016 m.initAndSaveState(true /*cleanup*/) 1017 m.expectTargets(1) 1018 1019 // Remove all mountpaths on the target 1020 target, _ := m.smap.GetRandTarget() 1021 tname := target.StringEx() 1022 origMountpaths, err := api.GetMountpaths(baseParams, target) 1023 tassert.CheckFatal(t, err) 1024 ensureNoDisabledMountpaths(t, target, origMountpaths) 1025 1026 if len(origMountpaths.WaitingDD) != 0 || len(origMountpaths.Disabled) != 0 { 1027 tlog.Logf("Warning %s: orig mountpaths (avail=%d, dd=%d, disabled=%d)\n", tname, 1028 len(origMountpaths.Available), len(origMountpaths.WaitingDD), len(origMountpaths.Disabled)) 1029 for _, mpath := range origMountpaths.Disabled { 1030 err = api.EnableMountpath(baseParams, target, mpath) 1031 tlog.Logf("Warning %s: late enable %q, err=%v\n", tname, mpath, err) 1032 time.Sleep(2 * time.Second) 1033 } 1034 origMountpaths, err = api.GetMountpaths(baseParams, target) 1035 tassert.CheckFatal(t, err) 1036 } else { 1037 tlog.Logf("%s: orig avail mountpaths=%d\n", tname, len(origMountpaths.Available)) 1038 } 1039 disabled := make(cos.StrSet) 1040 defer func() { 1041 for mpath := range disabled { 1042 err := api.EnableMountpath(baseParams, target, mpath) 1043 tassert.CheckError(t, err) 1044 } 1045 if len(disabled) != 0 { 1046 tlog.Logf("Wait for rebalance (when target %s that has previously lost all mountpaths joins back)\n", 1047 tname) 1048 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 1049 _, _ = api.WaitForXactionIC(baseParams, &args) 1050 1051 tools.WaitForResilvering(t, baseParams, nil) 1052 } 1053 }() 1054 for _, mpath := range origMountpaths.Available { 1055 err := api.DisableMountpath(baseParams, target, mpath, true /*dont-resil*/) 1056 tassert.CheckFatal(t, err) 1057 disabled.Add(mpath) 1058 } 1059 1060 time.Sleep(2 * time.Second) 1061 tlog.Logf("Wait for rebalance (triggered by %s leaving the cluster after having lost all mountpaths)\n", tname) 1062 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 1063 _, _ = api.WaitForXactionIC(baseParams, &args) 1064 1065 // Check if mountpaths were actually disabled 1066 time.Sleep(time.Second) 1067 mountpaths, err := api.GetMountpaths(baseParams, target) 1068 tassert.CheckFatal(t, err) 1069 1070 if len(mountpaths.Available) != 0 { 1071 t.Fatalf("%s should not have any mountpaths left (%d)", tname, len(mountpaths.Available)) 1072 } 1073 if len(mountpaths.Disabled)+len(mountpaths.WaitingDD) != len(origMountpaths.Available) { 1074 t.Fatalf("%s: not all mountpaths were disabled (%d, %d, %d)", tname, 1075 len(mountpaths.Disabled), len(mountpaths.WaitingDD), len(origMountpaths.Available)) 1076 } 1077 1078 // Create ais bucket 1079 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1080 1081 // Re-enable target mountpaths 1082 for _, mpath := range origMountpaths.Available { 1083 err := api.EnableMountpath(baseParams, target, mpath) 1084 tassert.CheckFatal(t, err) 1085 disabled.Delete(mpath) 1086 } 1087 1088 time.Sleep(2 * time.Second) 1089 tlog.Logf("Wait for rebalance (when target %s that has previously lost all mountpaths joins back)\n", target.StringEx()) 1090 args = xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 1091 _, _ = api.WaitForXactionIC(baseParams, &args) 1092 1093 tools.WaitForResilvering(t, baseParams, target) 1094 1095 tlog.Logf("waiting for bucket %s to show up on all targets\n", m.bck) 1096 err = checkTargetBMDsFor(m.proxyURL, m.bck) 1097 tassert.CheckFatal(t, err) 1098 1099 // Put and read random files 1100 m.puts() 1101 m.gets(nil, false) 1102 1103 m.ensureNoGetErrors() 1104 m.ensureNumMountpaths(target, origMountpaths) 1105 } 1106 1107 // get BMD from each target; check the BMD for the specified bucket 1108 func checkTargetBMDsFor(proxyURL string, bck cmn.Bck) error { 1109 bp := tools.BaseAPIParams(proxyURL) 1110 smap, err := api.GetClusterMap(bp) 1111 if err != nil { 1112 return err 1113 } 1114 to := time.Now().Add(10 * time.Second) 1115 b := meta.CloneBck(&bck) 1116 for tid := range smap.Tmap { 1117 // poll 1118 for { 1119 // alternatively, something like: api.GetBMD(tools.BaseAPIParams(tsi.URL(...))) 1120 val, err := api.GetNodeMeta(bp, tid, apc.WhatBMD) 1121 if err != nil { 1122 return err 1123 } 1124 bmd := val.(*meta.BMD) 1125 if _, bucketExists := bmd.Get(b); bucketExists { 1126 break 1127 } 1128 if time.Now().After(to) { 1129 return errors.New("checkBMDsFor: timeout") 1130 } 1131 time.Sleep(time.Second) 1132 } 1133 } 1134 return nil 1135 } 1136 1137 func TestForwardCP(t *testing.T) { 1138 m := ioContext{ 1139 t: t, 1140 num: 10000, 1141 numGetsEachFile: 2, 1142 fileSize: 128, 1143 } 1144 1145 // Step 1. 1146 m.initAndSaveState(true /*cleanup*/) 1147 m.expectProxies(2) 1148 1149 // Step 2. 1150 origID, origURL := m.smap.Primary.ID(), m.smap.Primary.PubNet.URL 1151 nextProxyID, nextProxyURL, err := chooseNextProxy(m.smap) 1152 tassert.CheckFatal(t, err) 1153 1154 t.Cleanup(func() { 1155 // Restore original primary. 1156 m.smap = tools.GetClusterMap(m.t, m.proxyURL) 1157 setPrimaryTo(t, m.proxyURL, m.smap, origURL, origID) 1158 1159 time.Sleep(time.Second) 1160 }) 1161 1162 tools.CreateBucket(t, nextProxyURL, m.bck, nil, true /*cleanup*/) 1163 tlog.Logf("Created bucket %s via non-primary %s\n", m.bck, nextProxyID) 1164 1165 // Step 3. 1166 m.puts() 1167 1168 // Step 4. in parallel: run GETs and designate a new primary=nextProxyID 1169 wg := &sync.WaitGroup{} 1170 wg.Add(2) 1171 go func() { 1172 defer wg.Done() 1173 m.gets(nil, false) 1174 }() 1175 go func() { 1176 defer wg.Done() 1177 1178 setPrimaryTo(t, m.proxyURL, m.smap, nextProxyURL, nextProxyID) 1179 m.proxyURL = nextProxyURL 1180 }() 1181 wg.Wait() 1182 1183 m.ensureNoGetErrors() 1184 1185 // Step 5. destroy ais bucket via original primary which is not primary at this point 1186 tools.DestroyBucket(t, origURL, m.bck) 1187 tlog.Logf("Destroyed bucket %s via non-primary %s/%s\n", m.bck, origID, origURL) 1188 } 1189 1190 func TestAtimeRebalance(t *testing.T) { 1191 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1192 1193 m := ioContext{ 1194 t: t, 1195 num: 2000, 1196 numGetsEachFile: 2, 1197 } 1198 1199 m.initAndSaveState(true /*cleanup*/) 1200 m.expectTargets(2) 1201 1202 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1203 1204 target := m.startMaintenanceNoRebalance() 1205 1206 m.puts() 1207 1208 // Get atime in a format that includes nanoseconds to properly check if it 1209 // was updated in atime cache (if it wasn't, then the returned atime would 1210 // be different from the original one, but the difference could be very small). 1211 msg := &apc.LsoMsg{TimeFormat: time.StampNano} 1212 msg.AddProps(apc.GetPropsAtime, apc.GetPropsStatus) 1213 baseParams := tools.BaseAPIParams(m.proxyURL) 1214 lst, err := api.ListObjects(baseParams, m.bck, msg, api.ListArgs{}) 1215 tassert.CheckFatal(t, err) 1216 1217 objNames := make(cos.StrKVs, 10) 1218 for _, en := range lst.Entries { 1219 objNames[en.Name] = en.Atime 1220 } 1221 1222 rebID := m.stopMaintenance(target) 1223 1224 // make sure that the cluster has all targets enabled 1225 _, err = tools.WaitForClusterState( 1226 m.proxyURL, 1227 "target joined back", 1228 m.smap.Version, 1229 m.originalProxyCount, 1230 m.originalTargetCount, 1231 ) 1232 tassert.CheckFatal(t, err) 1233 1234 tools.WaitForRebalanceByID(t, baseParams, rebID) 1235 1236 msg = &apc.LsoMsg{TimeFormat: time.StampNano} 1237 msg.AddProps(apc.GetPropsAtime, apc.GetPropsStatus) 1238 lstReb, err := api.ListObjects(baseParams, m.bck, msg, api.ListArgs{}) 1239 tassert.CheckFatal(t, err) 1240 1241 itemCount, itemCountOk := len(lstReb.Entries), 0 1242 l := len(lst.Entries) 1243 if itemCount != l { 1244 t.Errorf("The number of objects mismatch: before %d, after %d", len(lst.Entries), itemCount) 1245 } 1246 for _, en := range lstReb.Entries { 1247 atime, ok := objNames[en.Name] 1248 if !ok { 1249 t.Errorf("Object %q not found", en.Name) 1250 continue 1251 } 1252 if atime != en.Atime { 1253 t.Errorf("Atime mismatched for %s: before %q, after %q", en.Name, atime, en.Atime) 1254 } 1255 if en.IsStatusOK() { 1256 itemCountOk++ 1257 } 1258 } 1259 if itemCountOk != l { 1260 t.Errorf("Wrong number of objects with status OK: %d (expecting %d)", itemCountOk, l) 1261 } 1262 } 1263 1264 func TestAtimeLocalGet(t *testing.T) { 1265 var ( 1266 bck = cmn.Bck{ 1267 Name: t.Name(), 1268 Provider: apc.AIS, 1269 } 1270 proxyURL = tools.RandomProxyURL(t) 1271 baseParams = tools.BaseAPIParams(proxyURL) 1272 objectName = t.Name() 1273 objectContent = readers.NewBytes([]byte("file content")) 1274 ) 1275 1276 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 1277 1278 _, err := api.PutObject(&api.PutArgs{BaseParams: baseParams, Bck: bck, ObjName: objectName, Reader: objectContent}) 1279 tassert.CheckFatal(t, err) 1280 1281 putAtime, putAtimeFormatted := tools.GetObjectAtime(t, baseParams, bck, objectName, time.RFC3339Nano) 1282 1283 // Get object so that atime is updated 1284 _, err = api.GetObject(baseParams, bck, objectName, nil) 1285 tassert.CheckFatal(t, err) 1286 1287 getAtime, getAtimeFormatted := tools.GetObjectAtime(t, baseParams, bck, objectName, time.RFC3339Nano) 1288 1289 if !(getAtime.After(putAtime)) { 1290 t.Errorf("Expected PUT atime (%s) to be before GET atime (%s)", putAtimeFormatted, getAtimeFormatted) 1291 } 1292 } 1293 1294 func TestAtimeColdGet(t *testing.T) { 1295 var ( 1296 bck = cliBck 1297 proxyURL = tools.RandomProxyURL(t) 1298 baseParams = tools.BaseAPIParams(proxyURL) 1299 objectName = t.Name() 1300 objectContent = readers.NewBytes([]byte("dummy content")) 1301 ) 1302 1303 tools.CheckSkip(t, &tools.SkipTestArgs{RemoteBck: true, Bck: bck}) 1304 api.DeleteObject(baseParams, bck, objectName) 1305 defer api.DeleteObject(baseParams, bck, objectName) 1306 1307 tools.PutObjectInRemoteBucketWithoutCachingLocally(t, bck, objectName, objectContent) 1308 1309 timeAfterPut := time.Now() 1310 1311 // Perform the COLD get 1312 oah, err := api.GetObject(baseParams, bck, objectName, nil) 1313 tassert.CheckFatal(t, err) 1314 1315 tlog.Logf("%+v\n", oah) // DEBUG 1316 1317 getAtime, getAtimeFormatted := tools.GetObjectAtime(t, baseParams, bck, objectName, time.RFC3339Nano) 1318 tassert.Fatalf(t, !getAtime.IsZero(), "GET atime is zero") 1319 1320 if !(getAtime.After(timeAfterPut)) { 1321 t.Errorf("Expected PUT atime (%s) to be before GET atime (%s)", timeAfterPut.Format(time.RFC3339Nano), getAtimeFormatted) 1322 } 1323 } 1324 1325 func TestAtimePrefetch(t *testing.T) { 1326 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1327 1328 var ( 1329 bck = cliBck 1330 proxyURL = tools.RandomProxyURL(t) 1331 baseParams = tools.BaseAPIParams(proxyURL) 1332 objectName = t.Name() 1333 numObjs = 10 1334 objPath = "atime/obj-" 1335 errCh = make(chan error, numObjs) 1336 nameCh = make(chan string, numObjs) 1337 objs = make([]string, 0, numObjs) 1338 ) 1339 1340 tools.CheckSkip(t, &tools.SkipTestArgs{RemoteBck: true, Bck: bck}) 1341 api.DeleteObject(baseParams, bck, objectName) 1342 defer func() { 1343 for _, obj := range objs { 1344 api.DeleteObject(baseParams, bck, obj) 1345 } 1346 }() 1347 1348 wg := &sync.WaitGroup{} 1349 for i := range numObjs { 1350 wg.Add(1) 1351 go func(idx int) { 1352 defer wg.Done() 1353 object := objPath + strconv.FormatUint(uint64(idx), 10) 1354 _, err := api.PutObject(&api.PutArgs{ 1355 BaseParams: baseParams, 1356 Bck: bck, 1357 ObjName: object, 1358 Reader: readers.NewBytes([]byte("dummy content")), 1359 }) 1360 if err == nil { 1361 nameCh <- object 1362 } else { 1363 errCh <- err 1364 } 1365 }(i) 1366 } 1367 wg.Wait() 1368 close(errCh) 1369 close(nameCh) 1370 tassert.SelectErr(t, errCh, "put", true) 1371 for obj := range nameCh { 1372 objs = append(objs, obj) 1373 } 1374 xid, err := api.EvictMultiObj(baseParams, bck, objs, "" /*template*/) 1375 tassert.CheckFatal(t, err) 1376 args := xact.ArgsMsg{ID: xid, Timeout: tools.RebalanceTimeout} 1377 _, err = api.WaitForXactionIC(baseParams, &args) 1378 tassert.CheckFatal(t, err) 1379 1380 timeAfterPut := time.Now() 1381 1382 { 1383 var msg apc.PrefetchMsg 1384 msg.ObjNames = objs 1385 xid, err = api.Prefetch(baseParams, bck, msg) 1386 tassert.CheckFatal(t, err) 1387 args = xact.ArgsMsg{ID: xid, Kind: apc.ActPrefetchObjects, Timeout: tools.RebalanceTimeout} 1388 _, err = api.WaitForXactionIC(baseParams, &args) 1389 tassert.CheckFatal(t, err) 1390 } 1391 1392 timeFormat := time.RFC3339Nano 1393 msg := &apc.LsoMsg{Props: apc.GetPropsAtime, TimeFormat: timeFormat, Prefix: objPath} 1394 lst, err := api.ListObjects(baseParams, bck, msg, api.ListArgs{}) 1395 tassert.CheckFatal(t, err) 1396 if len(lst.Entries) != numObjs { 1397 t.Errorf("Number of objects mismatch: expected %d, found %d", numObjs, len(lst.Entries)) 1398 } 1399 for _, en := range lst.Entries { 1400 atime, err := time.Parse(timeFormat, en.Atime) 1401 tassert.CheckFatal(t, err) 1402 if atime.After(timeAfterPut) { 1403 t.Errorf("Atime should not be updated after prefetch (got: atime after PUT: %s, atime after GET: %s).", 1404 timeAfterPut.Format(timeFormat), atime.Format(timeFormat)) 1405 } 1406 } 1407 } 1408 1409 func TestAtimeLocalPut(t *testing.T) { 1410 var ( 1411 bck = cmn.Bck{ 1412 Name: t.Name(), 1413 Provider: apc.AIS, 1414 } 1415 proxyURL = tools.RandomProxyURL(t) 1416 baseParams = tools.BaseAPIParams(proxyURL) 1417 objectName = t.Name() 1418 objectContent = readers.NewBytes([]byte("dummy content")) 1419 ) 1420 1421 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 1422 1423 timeBeforePut := time.Now() 1424 _, err := api.PutObject(&api.PutArgs{BaseParams: baseParams, Bck: bck, ObjName: objectName, Reader: objectContent}) 1425 tassert.CheckFatal(t, err) 1426 1427 putAtime, putAtimeFormatted := tools.GetObjectAtime(t, baseParams, bck, objectName, time.RFC3339Nano) 1428 1429 if !(putAtime.After(timeBeforePut)) { 1430 t.Errorf("Expected atime after PUT (%s) to be after atime before PUT (%s)", 1431 putAtimeFormatted, timeBeforePut.Format(time.RFC3339Nano)) 1432 } 1433 } 1434 1435 // 1. Unregister target 1436 // 2. Add bucket - unregistered target should miss the update 1437 // 3. Reregister target 1438 // 4. Put objects 1439 // 5. Get objects - everything should succeed 1440 func TestGetAndPutAfterReregisterWithMissedBucketUpdate(t *testing.T) { 1441 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1442 1443 m := ioContext{ 1444 t: t, 1445 num: 10000, 1446 numGetsEachFile: 5, 1447 } 1448 1449 m.initAndSaveState(true /*cleanup*/) 1450 m.expectTargets(2) 1451 1452 target := m.startMaintenanceNoRebalance() 1453 1454 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1455 1456 rebID := m.stopMaintenance(target) 1457 1458 m.puts() 1459 m.gets(nil, false) 1460 1461 m.ensureNoGetErrors() 1462 m.waitAndCheckCluState() 1463 baseParams := tools.BaseAPIParams(m.proxyURL) 1464 tools.WaitForRebalanceByID(t, baseParams, rebID) 1465 } 1466 1467 // 1. Unregister target 1468 // 2. Add bucket - unregistered target should miss the update 1469 // 3. Put objects 1470 // 4. Reregister target - rebalance kicks in 1471 // 5. Get objects - everything should succeed 1472 func TestGetAfterReregisterWithMissedBucketUpdate(t *testing.T) { 1473 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1474 1475 m := ioContext{ 1476 t: t, 1477 num: 10000, 1478 fileSize: 1024, 1479 numGetsEachFile: 5, 1480 } 1481 1482 // Initialize ioContext 1483 m.initAndSaveState(true /*cleanup*/) 1484 m.expectTargets(2) 1485 1486 targets := m.smap.Tmap.ActiveNodes() 1487 1488 // Unregister target[0] 1489 args := &apc.ActValRmNode{DaemonID: targets[0].ID(), SkipRebalance: true} 1490 _, err := api.StartMaintenance(tools.BaseAPIParams(m.proxyURL), args) 1491 tassert.CheckFatal(t, err) 1492 tools.WaitForClusterState( 1493 m.proxyURL, 1494 "remove target", 1495 m.smap.Version, 1496 m.originalProxyCount, 1497 m.originalTargetCount-1, 1498 ) 1499 1500 // Create ais bucket 1501 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1502 1503 m.puts() 1504 1505 // Reregister target 0 1506 rebID := m.stopMaintenance(targets[0]) 1507 1508 // Wait for rebalance and execute GETs 1509 baseParams := tools.BaseAPIParams(m.proxyURL) 1510 tools.WaitForRebalanceByID(t, baseParams, rebID) 1511 1512 m.gets(nil, false) 1513 1514 m.ensureNoGetErrors() 1515 m.waitAndCheckCluState() 1516 } 1517 1518 func TestRenewRebalance(t *testing.T) { 1519 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1520 1521 var ( 1522 m = ioContext{ 1523 t: t, 1524 num: 10000, 1525 numGetsEachFile: 5, 1526 otherTasksToTrigger: 1, 1527 } 1528 rebID string 1529 ) 1530 1531 m.initAndSaveState(true /*cleanup*/) 1532 m.expectTargets(2) 1533 1534 // Step 1: Unregister a target 1535 target := m.startMaintenanceNoRebalance() 1536 1537 // Step 2: Create an ais bucket 1538 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1539 1540 // Step 3: PUT objects in the bucket 1541 m.puts() 1542 1543 baseParams := tools.BaseAPIParams(m.proxyURL) 1544 1545 // Step 4: Re-register target (triggers rebalance) 1546 m.stopMaintenance(target) 1547 xargs := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceStartTimeout} 1548 err := api.WaitForXactionNode(baseParams, &xargs, xactSnapRunning) 1549 tassert.CheckError(t, err) 1550 tlog.Logf("rebalance started\n") 1551 1552 wg := &sync.WaitGroup{} 1553 wg.Add(2) 1554 // Step 5: GET objects from the buket 1555 go func() { 1556 defer wg.Done() 1557 m.gets(nil, false) 1558 }() 1559 1560 // Step 6: 1561 // - Start new rebalance manually after some time 1562 // - TODO: Verify that new rebalance xaction has started 1563 go func() { 1564 defer wg.Done() 1565 1566 <-m.controlCh // wait for half the GETs to complete 1567 1568 rebID, err = api.StartXaction(baseParams, &xact.ArgsMsg{Kind: apc.ActRebalance}, "") 1569 tassert.CheckFatal(t, err) 1570 tlog.Logf("manually initiated rebalance\n") 1571 }() 1572 1573 wg.Wait() 1574 args := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 1575 _, err = api.WaitForXactionIC(baseParams, &args) 1576 tassert.CheckError(t, err) 1577 1578 m.ensureNoGetErrors() 1579 m.waitAndCheckCluState() 1580 } 1581 1582 func TestGetFromMirroredWithLostOneMountpath(t *testing.T) { 1583 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1584 var ( 1585 copies = 2 1586 m = ioContext{ 1587 t: t, 1588 num: 5000, 1589 numGetsEachFile: 4, 1590 } 1591 baseParams = tools.BaseAPIParams() 1592 ) 1593 1594 m.initAndSaveState(true /*cleanup*/) 1595 m.expectTargets(1) 1596 1597 // Select one target at random 1598 target, _ := m.smap.GetRandTarget() 1599 mpList, err := api.GetMountpaths(baseParams, target) 1600 tassert.CheckFatal(t, err) 1601 ensureNoDisabledMountpaths(t, target, mpList) 1602 if len(mpList.Available) < copies { 1603 t.Fatalf("%s requires at least %d mountpaths per target", t.Name(), copies) 1604 } 1605 1606 // Step 1: Create a local bucket 1607 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1608 1609 // Step 2: Make the bucket redundant 1610 _, err = api.SetBucketProps(baseParams, m.bck, &cmn.BpropsToSet{ 1611 Mirror: &cmn.MirrorConfToSet{ 1612 Enabled: apc.Ptr(true), 1613 Copies: apc.Ptr(int64(copies)), 1614 }, 1615 }) 1616 if err != nil { 1617 t.Fatalf("Failed to make the bucket redundant: %v", err) 1618 } 1619 1620 // Step 3: PUT objects in the bucket 1621 m.puts() 1622 m.ensureNumCopies(baseParams, copies, false) 1623 1624 // Step 4: Remove a mountpath 1625 mpath := mpList.Available[0] 1626 tlog.Logf("Remove mountpath %s on target %s\n", mpath, target.ID()) 1627 err = api.DetachMountpath(baseParams, target, mpath, false /*dont-resil*/) 1628 tassert.CheckFatal(t, err) 1629 1630 tools.WaitForResilvering(t, baseParams, target) 1631 1632 // Step 5: GET objects from the bucket 1633 m.gets(nil, false) 1634 1635 m.ensureNumCopies(baseParams, copies, true /*greaterOk*/) 1636 1637 // Step 6: Add previously removed mountpath 1638 tlog.Logf("Add mountpath %s on target %s\n", mpath, target.ID()) 1639 err = api.AttachMountpath(baseParams, target, mpath) 1640 tassert.CheckFatal(t, err) 1641 1642 tools.WaitForResilvering(t, baseParams, target) 1643 1644 m.ensureNumCopies(baseParams, copies, true) 1645 m.ensureNoGetErrors() 1646 m.ensureNumMountpaths(target, mpList) 1647 } 1648 1649 func TestGetFromMirroredWithLostMountpathAllExceptOne(t *testing.T) { 1650 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1651 m := ioContext{ 1652 t: t, 1653 num: 10000, 1654 numGetsEachFile: 4, 1655 } 1656 m.initAndSaveState(true /*cleanup*/) 1657 baseParams := tools.BaseAPIParams(m.proxyURL) 1658 1659 // Select a random target 1660 target, _ := m.smap.GetRandTarget() 1661 mpList, err := api.GetMountpaths(baseParams, target) 1662 mpathCount := len(mpList.Available) 1663 ensureNoDisabledMountpaths(t, target, mpList) 1664 tassert.CheckFatal(t, err) 1665 if mpathCount < 3 { 1666 t.Skipf("%s requires at least 3 mountpaths per target (%s has %d)", t.Name(), target.StringEx(), mpathCount) 1667 } 1668 1669 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1670 1671 // Make the bucket n-copy mirrored 1672 _, err = api.SetBucketProps(baseParams, m.bck, &cmn.BpropsToSet{ 1673 Mirror: &cmn.MirrorConfToSet{ 1674 Enabled: apc.Ptr(true), 1675 Copies: apc.Ptr(int64(mpathCount)), 1676 }, 1677 }) 1678 if err != nil { 1679 t.Fatalf("Failed to make the bucket redundant: %v", err) 1680 } 1681 1682 // PUT 1683 m.puts() 1684 m.ensureNumCopies(baseParams, mpathCount, false /*greaterOk*/) 1685 1686 // Remove all mountpaths except one 1687 tlog.Logf("Remove all except one (%q) mountpath on target %s\n", mpList.Available[0], target.StringEx()) 1688 for i, mpath := range mpList.Available[1:] { 1689 err = api.DetachMountpath(baseParams, target, mpath, false /*dont-resil*/) 1690 if err != nil { 1691 for j := range i { 1692 api.AttachMountpath(baseParams, target, mpList.Available[j+1]) 1693 } 1694 tassert.CheckFatal(t, err) 1695 } 1696 time.Sleep(time.Second) 1697 } 1698 1699 tools.WaitForResilvering(t, baseParams, target) 1700 1701 // Wait for async mirroring to finish 1702 flt := xact.ArgsMsg{Kind: apc.ActPutCopies, Bck: m.bck} 1703 api.WaitForXactionIdle(baseParams, &flt) 1704 time.Sleep(time.Second) // pending writes 1705 1706 // GET 1707 m.gets(nil, false) 1708 1709 // Reattach previously removed mountpaths 1710 tlog.Logf("Reattach mountpaths at %s\n", target.StringEx()) 1711 for _, mpath := range mpList.Available[1:] { 1712 err = api.AttachMountpath(baseParams, target, mpath) 1713 tassert.CheckFatal(t, err) 1714 time.Sleep(time.Second) 1715 } 1716 1717 tools.WaitForResilvering(t, baseParams, nil) 1718 1719 m.ensureNumCopies(baseParams, mpathCount, true /*greaterOk*/) 1720 m.ensureNoGetErrors() 1721 m.ensureNumMountpaths(target, mpList) 1722 } 1723 1724 // TODO: remove all except one mountpath, run short, reduce sleep, increase stress... 1725 func TestGetNonRedundantWithDisabledMountpath(t *testing.T) { 1726 testNonRedundantMpathDD(t, apc.ActMountpathDisable) 1727 } 1728 1729 func TestGetNonRedundantWithDetachedMountpath(t *testing.T) { 1730 testNonRedundantMpathDD(t, apc.ActMountpathDetach) 1731 } 1732 1733 func testNonRedundantMpathDD(t *testing.T, action string) { 1734 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 1735 m := ioContext{ 1736 t: t, 1737 num: 1000, 1738 numGetsEachFile: 2, 1739 } 1740 m.initAndSaveState(true /*cleanup*/) 1741 baseParams := tools.BaseAPIParams(m.proxyURL) 1742 1743 // Select a random target 1744 target, _ := m.smap.GetRandTarget() 1745 mpList, err := api.GetMountpaths(baseParams, target) 1746 tassert.CheckFatal(t, err) 1747 ensureNoDisabledMountpaths(t, target, mpList) 1748 1749 mpathCount := len(mpList.Available) 1750 if mpathCount < 2 { 1751 t.Skipf("%s requires at least 2 mountpaths per target (%s has %d)", t.Name(), target.StringEx(), mpathCount) 1752 } 1753 1754 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1755 1756 // PUT 1757 m.puts() 1758 1759 tlog.Logf("%s %q at target %s\n", action, mpList.Available[0], target.StringEx()) 1760 if action == apc.ActMountpathDisable { 1761 err = api.DisableMountpath(baseParams, target, mpList.Available[0], false /*dont-resil*/) 1762 } else { 1763 err = api.DetachMountpath(baseParams, target, mpList.Available[0], false /*dont-resil*/) 1764 } 1765 tassert.CheckFatal(t, err) 1766 1767 tools.WaitForResilvering(t, baseParams, target) 1768 1769 // GET 1770 m.gets(nil, false) 1771 1772 // Add previously disabled or detached mountpath 1773 if action == apc.ActMountpathDisable { 1774 tlog.Logf("Re-enable %q at target %s\n", mpList.Available[0], target.StringEx()) 1775 err = api.EnableMountpath(baseParams, target, mpList.Available[0]) 1776 } else { 1777 tlog.Logf("Re-attach %q at target %s\n", mpList.Available[0], target.StringEx()) 1778 err = api.AttachMountpath(baseParams, target, mpList.Available[0]) 1779 } 1780 tassert.CheckFatal(t, err) 1781 1782 tools.WaitForResilvering(t, baseParams, target) 1783 1784 m.ensureNoGetErrors() 1785 m.ensureNumMountpaths(target, mpList) 1786 } 1787 1788 // 1. Start rebalance 1789 // 2. Start changing the primary proxy 1790 // 3. IC must survive and rebalance must finish 1791 func TestICRebalance(t *testing.T) { 1792 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, RequiredDeployment: tools.ClusterTypeLocal}) 1793 1794 var ( 1795 m = ioContext{ 1796 t: t, 1797 num: 25000, 1798 } 1799 rebID string 1800 ) 1801 1802 m.initAndSaveState(true /*cleanup*/) 1803 m.expectTargets(3) 1804 m.expectProxies(3) 1805 psi, err := m.smap.GetRandProxy(true /*exclude primary*/) 1806 tassert.CheckFatal(t, err) 1807 m.proxyURL = psi.URL(cmn.NetPublic) 1808 icNode := tools.GetICProxy(t, m.smap, psi.ID()) 1809 1810 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1811 1812 m.puts() 1813 1814 baseParams := tools.BaseAPIParams(m.proxyURL) 1815 1816 tlog.Logf("Manually initiated rebalance\n") 1817 rebID, err = api.StartXaction(baseParams, &xact.ArgsMsg{Kind: apc.ActRebalance}, "") 1818 tassert.CheckFatal(t, err) 1819 1820 xargs := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceStartTimeout} 1821 api.WaitForXactionNode(baseParams, &xargs, xactSnapRunning) 1822 1823 tlog.Logf("Killing %s\n", icNode.StringEx()) 1824 // cmd and args are the original command line of how the proxy is started 1825 cmd, err := tools.KillNode(icNode) 1826 tassert.CheckFatal(t, err) 1827 1828 proxyCnt := m.smap.CountActivePs() 1829 smap, err := tools.WaitForClusterState(m.proxyURL, "designate new primary", m.smap.Version, proxyCnt-1, 0) 1830 tassert.CheckError(t, err) 1831 1832 // re-construct the command line to start the original proxy but add the current primary proxy to the args 1833 err = tools.RestoreNode(cmd, false, "proxy (prev primary)") 1834 tassert.CheckFatal(t, err) 1835 1836 smap, err = tools.WaitForClusterState(m.proxyURL, "restore", smap.Version, proxyCnt, 0) 1837 tassert.CheckFatal(t, err) 1838 if _, ok := smap.Pmap[psi.ID()]; !ok { 1839 t.Fatalf("Previous primary proxy did not rejoin the cluster") 1840 } 1841 checkSmaps(t, m.proxyURL) 1842 1843 tlog.Logf("Wait for rebalance: %s\n", rebID) 1844 args := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 1845 _, _ = api.WaitForXactionIC(baseParams, &args) 1846 1847 m.waitAndCheckCluState() 1848 } 1849 1850 // 1. Start decommissioning a target with rebalance 1851 // 2. Start changing the primary proxy 1852 // 3. IC must survive, rebalance must finish, and the target must be gone 1853 func TestICDecommission(t *testing.T) { 1854 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, RequiredDeployment: tools.ClusterTypeLocal}) 1855 1856 var ( 1857 err error 1858 m = ioContext{ 1859 t: t, 1860 num: 25000, 1861 } 1862 ) 1863 1864 m.initAndSaveState(true /*cleanup*/) 1865 m.expectTargets(3) 1866 m.expectProxies(3) 1867 psi, err := m.smap.GetRandProxy(true /*exclude primary*/) 1868 tassert.CheckFatal(t, err) 1869 m.proxyURL = psi.URL(cmn.NetPublic) 1870 tlog.Logf("Monitoring node: %s\n", psi.StringEx()) 1871 icNode := tools.GetICProxy(t, m.smap, psi.ID()) 1872 1873 tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/) 1874 1875 m.puts() 1876 1877 baseParams := tools.BaseAPIParams(m.proxyURL) 1878 tsi, err := m.smap.GetRandTarget() 1879 tassert.CheckFatal(t, err) 1880 1881 args := &apc.ActValRmNode{DaemonID: tsi.ID(), SkipRebalance: true} 1882 _, err = api.StartMaintenance(baseParams, args) 1883 tassert.CheckFatal(t, err) 1884 1885 defer func() { 1886 args := &apc.ActValRmNode{DaemonID: tsi.ID()} 1887 rebID, err := api.StopMaintenance(baseParams, args) 1888 tassert.CheckFatal(t, err) 1889 tools.WaitForRebalanceByID(t, baseParams, rebID) 1890 tassert.CheckFatal(t, err) 1891 }() 1892 1893 tassert.CheckFatal(t, err) 1894 tlog.Logf("Killing %s\n", icNode.StringEx()) 1895 1896 // cmd and args are the original command line of how the proxy is started 1897 cmd, err := tools.KillNode(icNode) 1898 tassert.CheckFatal(t, err) 1899 1900 proxyCnt := m.smap.CountActivePs() 1901 smap, err := tools.WaitForClusterState(m.proxyURL, "designate new primary", m.smap.Version, proxyCnt-1, 0) 1902 tassert.CheckError(t, err) 1903 1904 // re-construct the command line to start the original proxy but add the current primary proxy to the args 1905 err = tools.RestoreNode(cmd, false, "proxy (prev primary)") 1906 tassert.CheckFatal(t, err) 1907 1908 smap, err = tools.WaitForClusterState(m.proxyURL, "restore", smap.Version, proxyCnt, 0) 1909 tassert.CheckFatal(t, err) 1910 if _, ok := smap.Pmap[psi.ID()]; !ok { 1911 t.Fatalf("Previous primary proxy did not rejoin the cluster") 1912 } 1913 checkSmaps(t, m.proxyURL) 1914 1915 _, err = tools.WaitForClusterState(m.proxyURL, "decommission target", 1916 m.smap.Version, m.smap.CountProxies(), m.smap.CountTargets()-1) 1917 tassert.CheckFatal(t, err) 1918 } 1919 1920 func TestSingleResilver(t *testing.T) { 1921 m := ioContext{t: t} 1922 m.initAndSaveState(true /*cleanup*/) 1923 baseParams := tools.BaseAPIParams(m.proxyURL) 1924 1925 // Select a random target 1926 target, _ := m.smap.GetRandTarget() 1927 1928 // Start resilvering just on the target 1929 args := xact.ArgsMsg{Kind: apc.ActResilver, DaemonID: target.ID()} 1930 id, err := api.StartXaction(baseParams, &args, "") 1931 tassert.CheckFatal(t, err) 1932 1933 // Wait for specific resilvering x[id] 1934 args = xact.ArgsMsg{ID: id, Kind: apc.ActResilver, Timeout: tools.RebalanceTimeout} 1935 _, err = api.WaitForXactionIC(baseParams, &args) 1936 tassert.CheckFatal(t, err) 1937 1938 // Make sure other nodes were not resilvered 1939 args = xact.ArgsMsg{ID: id} 1940 snaps, err := api.QueryXactionSnaps(baseParams, &args) 1941 tassert.CheckFatal(t, err) 1942 tassert.Errorf(t, len(snaps) == 1, "expected only 1 resilver") 1943 }