github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/fshc_test.go (about) 1 // Package integration_test. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package integration_test 6 7 import ( 8 "fmt" 9 "net/http" 10 "os" 11 "path" 12 "sync" 13 "testing" 14 "time" 15 16 "github.com/NVIDIA/aistore/api" 17 "github.com/NVIDIA/aistore/api/apc" 18 "github.com/NVIDIA/aistore/cmn" 19 "github.com/NVIDIA/aistore/cmn/cos" 20 "github.com/NVIDIA/aistore/core/meta" 21 "github.com/NVIDIA/aistore/tools" 22 "github.com/NVIDIA/aistore/tools/readers" 23 "github.com/NVIDIA/aistore/tools/tassert" 24 "github.com/NVIDIA/aistore/tools/tlog" 25 "github.com/NVIDIA/aistore/tools/trand" 26 "github.com/NVIDIA/aistore/xact" 27 ) 28 29 const ( 30 fshcDetectTimeMax = time.Second * 10 31 fshcRunTimeMax = time.Second * 15 32 fshcDir = "fschecker" 33 ) 34 35 type checkerMD struct { 36 t *testing.T 37 seed int64 38 numObjs int 39 proxyURL string 40 bck cmn.Bck 41 smap *meta.Smap 42 mpList meta.NodeMap 43 allMps map[string]*apc.MountpathList 44 origAvail int 45 fileSize int64 46 baseParams api.BaseParams 47 chstop chan struct{} 48 chfail chan struct{} 49 wg *sync.WaitGroup 50 } 51 52 func newCheckerMD(t *testing.T) *checkerMD { 53 md := &checkerMD{ 54 t: t, 55 seed: 300, 56 proxyURL: tools.RandomProxyURL(), 57 bck: cmn.Bck{ 58 Name: testBucketName, 59 Provider: apc.AIS, 60 }, 61 fileSize: 64 * cos.KiB, 62 mpList: make(meta.NodeMap, 10), 63 allMps: make(map[string]*apc.MountpathList, 10), 64 chstop: make(chan struct{}), 65 chfail: make(chan struct{}), 66 wg: &sync.WaitGroup{}, 67 } 68 69 md.init() 70 md.numObjs = 20 * len(md.mpList) 71 tlog.Logf("Create %d objects[%d mountpaths] for test\n", md.numObjs, len(md.mpList)) 72 73 return md 74 } 75 76 func (md *checkerMD) init() { 77 md.baseParams = tools.BaseAPIParams(md.proxyURL) 78 md.smap = tools.GetClusterMap(md.t, md.proxyURL) 79 80 for targetID, tsi := range md.smap.Tmap { 81 tlog.Logf("Target: %s\n", targetID) 82 lst, err := api.GetMountpaths(md.baseParams, tsi) 83 tassert.CheckFatal(md.t, err) 84 tlog.Logf(" Mountpaths: %v\n", lst) 85 86 for _, mpath := range lst.Available { 87 si, ok := md.mpList[mpath] 88 tassert.Errorf(md.t, !ok, "duplication (%s, %s, %s)", si, mpath, tsi) 89 md.mpList[mpath] = tsi 90 } 91 md.allMps[targetID] = lst 92 93 md.origAvail += len(lst.Available) 94 } 95 } 96 97 func (md *checkerMD) ensureNumMountpaths(target *meta.Snode, mpList *apc.MountpathList) { 98 ensureNumMountpaths(md.t, target, mpList) 99 } 100 101 func (md *checkerMD) randomTargetMpath() (target *meta.Snode, mpath string, mpathMap *apc.MountpathList) { 102 // select random target and mountpath 103 for m, t := range md.mpList { 104 target, mpath = t, m 105 mpathMap = md.allMps[target.ID()] 106 break 107 } 108 return 109 } 110 111 func (md *checkerMD) runTestAsync(method string, target *meta.Snode, mpath string, mpathList *apc.MountpathList, suffix string) { 112 md.wg.Add(1) 113 go runAsyncJob(md.t, md.bck, md.wg, method, mpath, fileNames, md.chfail, md.chstop, suffix) 114 // let the job run for a while and then make a mountpath broken 115 time.Sleep(2 * time.Second) 116 md.chfail <- struct{}{} 117 if detected := waitForMountpathChanges(md.t, target, len(mpathList.Available)-1, len(mpathList.Disabled)+1, true); detected { 118 // let the job run for a while with broken mountpath, so FSHC detects the trouble 119 time.Sleep(2 * time.Second) 120 md.chstop <- struct{}{} 121 } 122 md.wg.Wait() 123 124 repairMountpath(md.t, target, mpath, len(mpathList.Available), len(mpathList.Disabled), suffix) 125 } 126 127 func (md *checkerMD) runTestSync(method string, target *meta.Snode, mpath string, mpathList *apc.MountpathList, 128 objList []string, suffix string) { 129 breakMountpath(md.t, mpath, suffix) 130 defer repairMountpath(md.t, target, mpath, len(mpathList.Available), len(mpathList.Disabled), suffix) 131 132 switch method { 133 case http.MethodPut: 134 p, err := api.HeadBucket(md.baseParams, md.bck, true /* don't add */) 135 tassert.CheckFatal(md.t, err) 136 for _, objName := range objList { 137 r, _ := readers.NewRand(md.fileSize, p.Cksum.Type) 138 _, err := api.PutObject(&api.PutArgs{ 139 BaseParams: md.baseParams, 140 Bck: md.bck, 141 ObjName: path.Join(fshcDir, objName), 142 Reader: r, 143 Size: uint64(md.fileSize), 144 }) 145 if err != nil { 146 tlog.Logf("%s: %v\n", objName, err) 147 } 148 } 149 case http.MethodGet: 150 for _, objName := range objList { 151 // GetObject must fail - so no error checking 152 _, err := api.GetObject(md.baseParams, md.bck, objName, nil) 153 if err == nil { 154 md.t.Errorf("Get %q must fail", objName) 155 } 156 } 157 } 158 159 if detected := waitForMountpathChanges(md.t, target, len(mpathList.Available)-1, len(mpathList.Disabled)+1, false); detected { 160 md.t.Error("PUT objects to a broken mountpath should not disable the mountpath when FSHC is disabled") 161 } 162 } 163 164 func waitForMountpathChanges(t *testing.T, target *meta.Snode, availLen, disabledLen int, failIfDiffer bool) bool { 165 var ( 166 err error 167 newMpaths *apc.MountpathList 168 baseParams = tools.BaseAPIParams() 169 ) 170 171 detectStart := time.Now() 172 detectLimit := time.Now().Add(fshcDetectTimeMax) 173 174 for detectLimit.After(time.Now()) { 175 newMpaths, err = api.GetMountpaths(baseParams, target) 176 if err != nil { 177 t.Errorf("Failed to read target mountpaths: %v\n", err) 178 break 179 } 180 if len(newMpaths.Disabled) == disabledLen { 181 break 182 } 183 time.Sleep(time.Millisecond * 100) 184 } 185 detectTime := time.Since(detectStart) 186 tlog.Logf("passed %v\n", detectTime) 187 188 if len(newMpaths.Disabled) == disabledLen && len(newMpaths.Available) == availLen { 189 tlog.Logf("Check is successful in %v\n", detectTime) 190 return true 191 } 192 193 if !failIfDiffer { 194 return false 195 } 196 197 tlog.Logf("Current mpath list: %v\n", newMpaths) 198 if len(newMpaths.Disabled) != disabledLen { 199 t.Errorf("Disabled mpath count mismatch, old count: %v, new list: %v", 200 disabledLen, newMpaths.Disabled) 201 } else if len(newMpaths.Available) != availLen { 202 t.Errorf("Available mpath count mismatch, old count: %v, new list: %v", 203 availLen, newMpaths.Available) 204 } 205 return false 206 } 207 208 // Simulating mountpath death requested. 209 // It is the easiest way to simulate: stop putting data and 210 // replace the mountpath with regular file. If we do not stop 211 // putting objects it recreates the mountpath and does not fail 212 func breakMountpath(t *testing.T, mpath, suffix string) { 213 os.Rename(mpath, mpath+suffix) 214 f, err := os.OpenFile(mpath, os.O_CREATE|os.O_WRONLY, cos.PermRWR) 215 if err != nil { 216 t.Errorf("Failed to create file: %v", err) 217 } 218 f.Close() 219 } 220 221 func repairMountpath(t *testing.T, target *meta.Snode, mpath string, availLen, disabledLen int, suffix string) { 222 var ( 223 err error 224 baseParams = tools.BaseAPIParams() 225 ) 226 227 // "broken" mpath does no exist, nothing to restore 228 if err := cos.Stat(mpath + suffix); err != nil { 229 return 230 } 231 // cleanup 232 // restore original mountpath 233 os.Remove(mpath) 234 cos.Rename(mpath+suffix, mpath) 235 236 // ask fschecker to check all mountpath - it should make disabled 237 // mountpath back to available list 238 api.EnableMountpath(baseParams, target, mpath) 239 tlog.Logln("Recheck mountpaths") 240 detectStart := time.Now() 241 detectLimit := time.Now().Add(fshcDetectTimeMax) 242 var mpaths *apc.MountpathList 243 // Wait for fsckeeper detects that the mountpath is accessible now 244 for detectLimit.After(time.Now()) { 245 mpaths, err = api.GetMountpaths(baseParams, target) 246 if err != nil { 247 t.Errorf("Failed to read target mountpaths: %v\n", err) 248 break 249 } 250 if len(mpaths.Disabled) == disabledLen && len(mpaths.Available) == availLen { 251 break 252 } 253 time.Sleep(time.Second) 254 } 255 256 // final test checks - available and disabled lists must equal list 257 // before starting the test 258 if len(mpaths.Disabled) != disabledLen { 259 t.Errorf("Failed mountpath is still disabled in %v\nExpected disabled count: %d\nNew list:%v\n", 260 time.Since(detectStart), disabledLen, mpaths.Disabled) 261 } else if len(mpaths.Available) != availLen { 262 t.Errorf("Failed mountpath is not back in %v.\nExpected available count: %d\nNew list:%v\n", 263 time.Since(detectStart), availLen, mpaths.Available) 264 } 265 } 266 267 func runAsyncJob(t *testing.T, bck cmn.Bck, wg *sync.WaitGroup, op, mpath string, filelist []string, chfail, 268 chstop chan struct{}, suffix string) { 269 defer wg.Done() 270 271 const fileSize = 64 * cos.KiB 272 var ( 273 proxyURL = tools.RandomProxyURL() 274 baseParams = tools.BaseAPIParams(proxyURL) 275 ) 276 277 tlog.Logf("Testing mpath fail detection on %s\n", op) 278 stopTime := time.Now().Add(fshcRunTimeMax) 279 280 p, err := api.HeadBucket(baseParams, bck, true /* don't add */) 281 tassert.CheckFatal(t, err) 282 283 for stopTime.After(time.Now()) { 284 errCh := make(chan error, len(filelist)) 285 objsPutCh := make(chan string, len(filelist)) 286 287 for _, fname := range filelist { 288 select { 289 case <-chfail: 290 breakMountpath(t, mpath, suffix) 291 case <-chstop: 292 return 293 default: 294 // do nothing and just start the next loop 295 } 296 297 switch op { 298 case "PUT": 299 r, _ := readers.NewRand(fileSize, p.Cksum.Type) 300 api.PutObject(&api.PutArgs{ 301 BaseParams: baseParams, 302 Bck: bck, 303 ObjName: path.Join(fshcDir, fname), 304 Reader: r, 305 Size: fileSize, 306 }) 307 case "GET": 308 api.GetObject(baseParams, bck, path.Join(fshcDir, fname), nil) 309 time.Sleep(time.Millisecond * 10) 310 default: 311 t.Errorf("Invalid operation: %s", op) 312 } 313 } 314 315 close(errCh) 316 close(objsPutCh) 317 } 318 } 319 320 func TestFSCheckerDetectionEnabled(t *testing.T) { 321 // TODO -- FIXME: 322 // revise all fs-checker tests that manipulate mountpaths, make sure 323 // those (mountpaths) are always getting restored correctly when (and if) a test fails - 324 // then remove the "skipping" - here and elsewhere 325 if true { 326 t.Skipf("skipping %s", t.Name()) 327 } 328 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 329 330 var ( 331 md = newCheckerMD(t) 332 suffix = "-" + trand.String(5) 333 ) 334 335 if md.origAvail == 0 { 336 t.Fatal("No available mountpaths found") 337 } 338 339 tools.CreateBucket(t, md.proxyURL, md.bck, nil, true /*cleanup*/) 340 selectedTarget, selectedMpath, selectedMpathList := md.randomTargetMpath() 341 tlog.Logf("mountpath %s of %s is selected for the test\n", selectedMpath, selectedTarget.StringEx()) 342 defer func() { 343 if err := api.DetachMountpath(md.baseParams, selectedTarget, selectedMpath, true /*dont-resil*/); err != nil { 344 t.Logf("Failed to remove mpath %s of %s: %v", selectedMpath, selectedTarget.StringEx(), err) 345 } 346 if err := api.AttachMountpath(md.baseParams, selectedTarget, selectedMpath); err != nil { 347 t.Logf("Failed to add mpath %s of %s: %v", selectedMpath, selectedTarget.StringEx(), err) 348 } 349 350 tools.WaitForResilvering(t, md.baseParams, nil) 351 352 md.ensureNumMountpaths(selectedTarget, md.allMps[selectedTarget.ID()]) 353 }() 354 355 // generate some filenames to PUT to them in a loop 356 generateRandomNames(md.numObjs) 357 358 // Checking detection on object PUT 359 md.runTestAsync(http.MethodPut, selectedTarget, selectedMpath, selectedMpathList, suffix) 360 // Checking detection on object GET 361 md.runTestAsync(http.MethodGet, selectedTarget, selectedMpath, selectedMpathList, suffix) 362 363 // Checking that reading "bad" objects does not disable mpath if the mpath is OK 364 tlog.Logf("Reading non-existing objects: read is expected to fail but mountpath must be available\n") 365 for n := 1; n < 10; n++ { 366 objName := fmt.Sprintf("%s/o%d", fshcDir, n) 367 if _, err := api.GetObject(md.baseParams, md.bck, objName, nil); err == nil { 368 t.Error("Should not be able to GET non-existing objects") 369 } 370 } 371 if detected := waitForMountpathChanges(t, selectedTarget, len(selectedMpathList.Available), len(selectedMpathList.Disabled), false); !detected { 372 t.Error("GETting non-existing objects should not disable mountpath") 373 repairMountpath(t, selectedTarget, selectedMpath, len(selectedMpathList.Available), len(selectedMpathList.Disabled), suffix) 374 } 375 } 376 377 func TestFSCheckerDetectionDisabled(t *testing.T) { 378 if true { 379 t.Skipf("skipping %s", t.Name()) 380 } 381 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 382 383 var ( 384 md = newCheckerMD(t) 385 suffix = "-" + trand.String(5) 386 ) 387 388 if md.origAvail == 0 { 389 t.Fatal("No available mountpaths found") 390 } 391 392 tlog.Logf("*** Testing with disabled FSHC***\n") 393 tools.SetClusterConfig(t, cos.StrKVs{"fshc.enabled": "false"}) 394 defer tools.SetClusterConfig(t, cos.StrKVs{"fshc.enabled": "true"}) 395 396 selectedTarget, selectedMpath, selectedMap := md.randomTargetMpath() 397 tlog.Logf("mountpath %s of %s is selected for the test\n", selectedMpath, selectedTarget.StringEx()) 398 tools.CreateBucket(t, md.proxyURL, md.bck, nil, true /*cleanup*/) 399 defer func() { 400 if err := api.DetachMountpath(md.baseParams, selectedTarget, selectedMpath, true /*dont-resil*/); err != nil { 401 t.Logf("Failed to remove mpath %s of %s: %v", selectedMpath, selectedTarget.StringEx(), err) 402 } 403 if err := api.AttachMountpath(md.baseParams, selectedTarget, selectedMpath); err != nil { 404 t.Logf("Failed to add mpath %s of %s: %v", selectedMpath, selectedTarget.StringEx(), err) 405 } 406 407 tools.WaitForResilvering(t, md.baseParams, nil) 408 409 md.ensureNumMountpaths(selectedTarget, md.allMps[selectedTarget.ID()]) 410 }() 411 412 // generate a short list of file to run the test (to avoid flooding the log with false errors) 413 objList := make([]string, 0, 5) 414 for n := range 5 { 415 objName := fmt.Sprintf("obj-fshc-%d", n) 416 objList = append(objList, objName) 417 } 418 419 // Checking detection on object PUT 420 md.runTestSync(http.MethodPut, selectedTarget, selectedMpath, selectedMap, objList, suffix) 421 // Checking detection on object GET 422 md.runTestSync(http.MethodGet, selectedTarget, selectedMpath, selectedMap, objList, suffix) 423 } 424 425 func TestFSCheckerEnablingMountpath(t *testing.T) { 426 if true { 427 t.Skipf("skipping %s", t.Name()) 428 } 429 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 430 var ( 431 proxyURL = tools.RandomProxyURL() 432 baseParams = tools.BaseAPIParams(proxyURL) 433 smap = tools.GetClusterMap(t, proxyURL) 434 mpList = make(meta.NodeMap, 10) 435 origAvail = 0 436 ) 437 438 for targetID, tsi := range smap.Tmap { 439 tlog.Logf("Target: %s\n", targetID) 440 lst, err := api.GetMountpaths(baseParams, tsi) 441 tassert.CheckFatal(t, err) 442 tlog.Logf(" Mountpaths: %v\n", lst) 443 444 for _, mpath := range lst.Available { 445 mpList[mpath] = tsi 446 } 447 448 origAvail += len(lst.Available) 449 } 450 451 if origAvail == 0 { 452 t.Fatal("No available mountpaths found") 453 } 454 455 // select random target and mountpath 456 var ( 457 selectedTarget *meta.Snode 458 selectedMpath string 459 ) 460 for m, t := range mpList { 461 selectedTarget, selectedMpath = t, m 462 break 463 } 464 465 origMpl, err := api.GetMountpaths(baseParams, selectedTarget) 466 tassert.CheckFatal(t, err) 467 468 err = api.EnableMountpath(baseParams, selectedTarget, selectedMpath) 469 if err != nil { 470 t.Errorf("Enabling available mountpath should return success, got: %v", err) 471 } 472 473 err = api.EnableMountpath(baseParams, selectedTarget, selectedMpath+"some_text") 474 if err == nil { 475 t.Errorf("Enabling non-existing mountpath should return error") 476 } else { 477 status := api.HTTPStatus(err) 478 if status != http.StatusNotFound { 479 t.Errorf("Expected status %d, got %d, %v", http.StatusNotFound, status, err) 480 } 481 } 482 tools.WaitForResilvering(t, baseParams, selectedTarget) 483 484 ensureNumMountpaths(t, selectedTarget, origMpl) 485 } 486 487 func TestFSCheckerTargetDisableAllMountpaths(t *testing.T) { 488 if true { 489 t.Skipf("skipping %s", t.Name()) 490 } 491 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 492 var ( 493 target *meta.Snode 494 495 proxyURL = tools.RandomProxyURL() 496 baseParams = tools.BaseAPIParams() 497 smap = tools.GetClusterMap(t, proxyURL) 498 proxyCnt = smap.CountActivePs() 499 targetCnt = smap.CountActiveTs() 500 ) 501 502 if targetCnt < 2 { 503 t.Skip("The number of targets must be at least 2") 504 } 505 506 target, _ = smap.GetRandTarget() 507 oldMpaths, err := api.GetMountpaths(baseParams, target) 508 tassert.CheckFatal(t, err) 509 if len(oldMpaths.Available) == 0 { 510 t.Fatalf("Target %s does not have mountpaths", target) 511 } 512 513 tlog.Logf("Removing all mountpaths from target: %s\n", target.StringEx()) 514 for _, mpath := range oldMpaths.Available { 515 err = api.DisableMountpath(baseParams, target, mpath, true /*dont-resil*/) 516 tassert.CheckFatal(t, err) 517 } 518 519 smap, err = tools.WaitForClusterState(proxyURL, "all mountpaths disabled", smap.Version, proxyCnt, targetCnt-1) 520 tassert.CheckFatal(t, err) 521 tlog.Logf("Wait for rebalance (triggered by %s leaving the cluster after having lost all mountpaths)\n", 522 target.StringEx()) 523 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 524 _, _ = api.WaitForXactionIC(baseParams, &args) 525 526 tlog.Logf("Restoring target %s mountpaths\n", target.ID()) 527 for _, mpath := range oldMpaths.Available { 528 err = api.EnableMountpath(baseParams, target, mpath) 529 tassert.CheckFatal(t, err) 530 } 531 532 _, err = tools.WaitForClusterState(proxyURL, "all mountpaths enabled", smap.Version, proxyCnt, targetCnt) 533 tassert.CheckFatal(t, err) 534 535 tlog.Logf("Wait for rebalance (when target %s that has previously lost all mountpaths joins back)\n", target.StringEx()) 536 args = xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 537 _, _ = api.WaitForXactionIC(baseParams, &args) 538 539 tools.WaitForResilvering(t, baseParams, nil) 540 541 ensureNumMountpaths(t, target, oldMpaths) 542 } 543 544 func TestFSAddMountpathRestartNode(t *testing.T) { 545 if true { 546 t.Skipf("skipping %s", t.Name()) 547 } 548 var ( 549 target *meta.Snode 550 551 proxyURL = tools.RandomProxyURL() 552 baseParams = tools.BaseAPIParams() 553 smap = tools.GetClusterMap(t, proxyURL) 554 proxyCnt = smap.CountProxies() 555 targetCnt = smap.CountActiveTs() 556 tmpMpath = "/tmp/testmp" 557 ) 558 if targetCnt < 2 { 559 t.Skip("The number of targets must be at least 2") 560 } 561 target, _ = smap.GetRandTarget() 562 oldMpaths, err := api.GetMountpaths(baseParams, target) 563 tassert.CheckFatal(t, err) 564 numMpaths := len(oldMpaths.Available) 565 tassert.Fatalf(t, numMpaths != 0, "target %s doesn't have mountpaths", target.StringEx()) 566 567 cos.CreateDir(tmpMpath) 568 tlog.Logf("Adding mountpath to %s\n", target.StringEx()) 569 err = api.AttachMountpath(baseParams, target, tmpMpath) 570 tassert.CheckFatal(t, err) 571 572 tools.WaitForResilvering(t, baseParams, target) 573 574 t.Cleanup(func() { 575 api.DetachMountpath(baseParams, target, tmpMpath, true /*dont-resil*/) 576 time.Sleep(2 * time.Second) 577 os.Remove(tmpMpath) 578 579 ensureNumMountpaths(t, target, oldMpaths) 580 }) 581 582 newMpaths, err := api.GetMountpaths(baseParams, target) 583 tassert.CheckFatal(t, err) 584 585 tassert.Fatalf(t, numMpaths+1 == len(newMpaths.Available), 586 "should add new mountpath - available %d!=%d", numMpaths+1, len(newMpaths.Available)) 587 588 // Kill and restore target 589 tlog.Logf("Killing %s\n", target.StringEx()) 590 tcmd, err := tools.KillNode(target) 591 tassert.CheckFatal(t, err) 592 smap, err = tools.WaitForClusterState(proxyURL, "target removed", smap.Version, proxyCnt, targetCnt-1) 593 594 tassert.CheckError(t, err) 595 tools.RestoreNode(tcmd, false, "target") 596 smap, err = tools.WaitForClusterState(smap.Primary.URL(cmn.NetPublic), "target restored", smap.Version, 597 proxyCnt, targetCnt) 598 tassert.CheckFatal(t, err) 599 if _, ok := smap.Tmap[target.ID()]; !ok { 600 t.Fatalf("Removed target didn't rejoin") 601 } 602 tlog.Logf("Wait for rebalance\n") 603 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 604 _, _ = api.WaitForXactionIC(baseParams, &args) 605 606 // Check if the node has newly added mountpath 607 newMpaths, err = api.GetMountpaths(baseParams, target) 608 tassert.CheckFatal(t, err) 609 tassert.Fatalf(t, numMpaths+1 == len(newMpaths.Available), 610 "should include newly added mountpath after restore - available %d!=%d", numMpaths+1, len(newMpaths.Available)) 611 } 612 613 func TestFSDisableAllExceptOneMountpathRestartNode(t *testing.T) { 614 if true { 615 t.Skipf("skipping %s", t.Name()) 616 } 617 tools.CheckSkip(t, &tools.SkipTestArgs{ 618 Long: true, 619 MinMountpaths: 3, 620 MinTargets: 2, 621 RequiredDeployment: tools.ClusterTypeLocal, 622 }) 623 var ( 624 target *meta.Snode 625 626 smap = tools.GetClusterMap(t, tools.RandomProxyURL()) 627 baseParams = tools.BaseAPIParams() 628 proxyURL = smap.Primary.URL(cmn.NetPublic) 629 proxyCnt = smap.CountProxies() 630 targetCnt = smap.CountActiveTs() 631 enabled bool 632 ) 633 for _, tsi := range smap.Tmap { 634 target = tsi 635 break 636 } 637 638 oldMpaths, err := api.GetMountpaths(baseParams, target) 639 tassert.CheckFatal(t, err) 640 mpathCnt := len(oldMpaths.Available) 641 tlog.Logf("Target %s has %d mountpaths\n", target.ID(), mpathCnt) 642 643 // Disable, temporarily, all mountpaths except 1. 644 mpaths := oldMpaths.Available[:mpathCnt-1] 645 for _, mpath := range mpaths { 646 tlog.Logf("Disable mountpath %q at %s\n", mpath, target.StringEx()) 647 err = api.DisableMountpath(baseParams, target, mpath, false /*dont-resil*/) 648 tassert.CheckFatal(t, err) 649 } 650 tools.WaitForResilvering(t, baseParams, target) 651 652 t.Cleanup(func() { 653 if enabled { 654 return 655 } 656 for _, mpath := range mpaths { 657 api.EnableMountpath(baseParams, target, mpath) 658 } 659 time.Sleep(time.Second) 660 661 tools.WaitForResilvering(t, baseParams, target) 662 663 ensureNumMountpaths(t, target, oldMpaths) 664 }) 665 666 // Kill and restore target 667 tlog.Logf("Killing target %s\n", target.StringEx()) 668 tcmd, err := tools.KillNode(target) 669 tassert.CheckFatal(t, err) 670 smap, err = tools.WaitForClusterState(proxyURL, "remove target", smap.Version, proxyCnt, targetCnt-1) 671 tassert.CheckFatal(t, err) 672 673 time.Sleep(time.Second) 674 err = tools.RestoreNode(tcmd, false, "target") 675 tassert.CheckFatal(t, err) 676 smap, err = tools.WaitForClusterState(proxyURL, "restore", smap.Version, proxyCnt, targetCnt) 677 tassert.CheckFatal(t, err) 678 tassert.Fatalf(t, smap.GetTarget(target.ID()) != nil, "removed target didn't rejoin") 679 680 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 681 _, _ = api.WaitForXactionIC(baseParams, &args) 682 683 // Check if the the mountpaths are disabled after restart. 684 newMpaths, err := api.GetMountpaths(baseParams, target) 685 tassert.CheckError(t, err) 686 tassert.Errorf( 687 t, len(newMpaths.Available) == 1, 688 "unexpected count of available mountpaths, got: %d, expected: %d", 689 len(newMpaths.Available), 1, 690 ) 691 tassert.Errorf( 692 t, len(newMpaths.Disabled) == mpathCnt-1, 693 "unexpected count of disabled mountpaths, got: %d, expected: %d", 694 len(newMpaths.Disabled), mpathCnt-1, 695 ) 696 697 // Re-enable the mountpaths 698 for _, mpath := range mpaths { 699 err = api.EnableMountpath(baseParams, target, mpath) 700 tassert.CheckFatal(t, err) 701 } 702 tools.WaitForResilvering(t, baseParams, target) 703 704 enabled = true 705 706 newMpaths, err = api.GetMountpaths(baseParams, target) 707 tassert.CheckFatal(t, err) 708 tassert.Errorf( 709 t, len(newMpaths.Available) == mpathCnt, 710 "unexpected count of available mountpaths, got: %d, expected: %d", 711 len(newMpaths.Available), mpathCnt, 712 ) 713 tassert.Errorf( 714 t, len(newMpaths.Disabled) == 0, 715 "unexpected count of disabled mountpaths, got: %d, expected: %d", 716 len(newMpaths.Disabled), 0, 717 ) 718 }