github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/maintain_test.go (about) 1 // Package integration_test. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package integration_test 6 7 import ( 8 "fmt" 9 "net/http" 10 "path/filepath" 11 "testing" 12 "time" 13 14 "github.com/NVIDIA/aistore/api" 15 "github.com/NVIDIA/aistore/api/apc" 16 "github.com/NVIDIA/aistore/cmn" 17 "github.com/NVIDIA/aistore/cmn/cos" 18 "github.com/NVIDIA/aistore/cmn/fname" 19 "github.com/NVIDIA/aistore/core/meta" 20 "github.com/NVIDIA/aistore/tools" 21 "github.com/NVIDIA/aistore/tools/readers" 22 "github.com/NVIDIA/aistore/tools/tassert" 23 "github.com/NVIDIA/aistore/tools/tlog" 24 "github.com/NVIDIA/aistore/xact" 25 ) 26 27 func TestMaintenanceOnOff(t *testing.T) { 28 tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3}) 29 proxyURL := tools.RandomProxyURL(t) 30 smap := tools.GetClusterMap(t, proxyURL) 31 32 tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs()) 33 34 // Invalid target case 35 msg := &apc.ActValRmNode{DaemonID: "fakeID", SkipRebalance: true} 36 _, err := api.StartMaintenance(baseParams, msg) 37 tassert.Fatalf(t, err != nil, "Maintenance for invalid daemon ID succeeded") 38 39 mntTarget, _ := smap.GetRandTarget() 40 msg.DaemonID = mntTarget.ID() 41 baseParams := tools.BaseAPIParams(proxyURL) 42 _, err = api.StartMaintenance(baseParams, msg) 43 tassert.CheckFatal(t, err) 44 smap, err = tools.WaitForClusterState(proxyURL, "target in maintenance", 45 smap.Version, smap.CountActivePs(), smap.CountActiveTs()-1) 46 tassert.CheckFatal(t, err) 47 _, err = api.StopMaintenance(baseParams, msg) 48 tassert.CheckFatal(t, err) 49 _, err = tools.WaitForClusterState(proxyURL, "target is back", 50 smap.Version, smap.CountActivePs(), smap.CountTargets()) 51 tassert.CheckFatal(t, err) 52 _, err = api.StopMaintenance(baseParams, msg) 53 tassert.Fatalf(t, err != nil, "Canceling maintenance must fail for 'normal' daemon") 54 } 55 56 func TestMaintenanceListObjects(t *testing.T) { 57 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, MinTargets: 3}) 58 59 var ( 60 bck = cmn.Bck{Name: "maint-list", Provider: apc.AIS} 61 m = &ioContext{ 62 t: t, 63 num: 1500, 64 fileSize: cos.KiB, 65 fixedSize: true, 66 bck: bck, 67 proxyURL: proxyURL, 68 } 69 proxyURL = tools.RandomProxyURL(t) 70 baseParams = tools.BaseAPIParams(proxyURL) 71 origEntries = make(map[string]*cmn.LsoEnt, 1500) 72 ) 73 74 m.initAndSaveState(true /*cleanup*/) 75 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 76 77 m.puts() 78 // 1. Perform list-object and populate entries map 79 msg := &apc.LsoMsg{} 80 msg.AddProps(apc.GetPropsChecksum, apc.GetPropsVersion, apc.GetPropsCopies, apc.GetPropsSize) 81 lst, err := api.ListObjects(baseParams, bck, msg, api.ListArgs{}) 82 tassert.CheckFatal(t, err) 83 tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d", 84 m.num, len(lst.Entries)) 85 for _, en := range lst.Entries { 86 origEntries[en.Name] = en 87 } 88 89 // 2. Put a random target in maintenance mode 90 tsi, _ := m.smap.GetRandTarget() 91 tlog.Logf("Put target %s in maintenance mode\n", tsi.StringEx()) 92 actVal := &apc.ActValRmNode{DaemonID: tsi.ID(), SkipRebalance: false} 93 rebID, err := api.StartMaintenance(baseParams, actVal) 94 tassert.CheckFatal(t, err) 95 96 defer func() { 97 rebID, err = api.StopMaintenance(baseParams, actVal) 98 tassert.CheckFatal(t, err) 99 _, err = tools.WaitForClusterState(proxyURL, "target is back", 100 m.smap.Version, m.smap.CountActivePs(), m.smap.CountTargets()) 101 args := xact.ArgsMsg{ID: rebID, Timeout: tools.RebalanceTimeout} 102 _, err = api.WaitForXactionIC(baseParams, &args) 103 tassert.CheckFatal(t, err) 104 }() 105 106 m.smap, err = tools.WaitForClusterState(proxyURL, "target in maintenance", 107 m.smap.Version, m.smap.CountActivePs(), m.smap.CountActiveTs()-1) 108 tassert.CheckFatal(t, err) 109 110 tools.WaitForRebalanceByID(t, baseParams, rebID) 111 112 // 3. Check if we can list all the objects 113 lst, err = api.ListObjects(baseParams, bck, msg, api.ListArgs{}) 114 tassert.CheckFatal(t, err) 115 tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d", 116 m.num, len(lst.Entries)) 117 for _, en := range lst.Entries { 118 origEntry, ok := origEntries[en.Name] 119 tassert.Fatalf(t, ok, "object %s missing in original entries", en.Name) 120 if en.Checksum != origEntry.Checksum || 121 en.Version != origEntry.Version || 122 en.Flags != origEntry.Flags || 123 en.Copies != origEntry.Copies { 124 t.Errorf("some fields of object %q, don't match: %#v v/s %#v ", en.Name, en, origEntry) 125 } 126 } 127 } 128 129 func TestMaintenanceMD(t *testing.T) { 130 // NOTE: this test requires local deployment as it checks local filesystem for VMDs. 131 tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, RequiredDeployment: tools.ClusterTypeLocal}) 132 133 var ( 134 proxyURL = tools.RandomProxyURL(t) 135 smap = tools.GetClusterMap(t, proxyURL) 136 baseParams = tools.BaseAPIParams(proxyURL) 137 138 dcmTarget, _ = smap.GetRandTarget() 139 allTgtsMpaths = tools.GetTargetsMountpaths(t, smap, baseParams) 140 ) 141 142 tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs()) 143 144 t.Cleanup(func() { 145 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 146 api.WaitForXactionIC(baseParams, &args) 147 }) 148 149 tlog.Logf("Decommission %s\n", dcmTarget.StringEx()) 150 cmd := tools.GetRestoreCmd(dcmTarget) 151 msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), SkipRebalance: true, KeepInitialConfig: true} 152 _, err := api.DecommissionNode(baseParams, msg) 153 tassert.CheckFatal(t, err) 154 155 _, err = tools.WaitForClusterState(proxyURL, "target decommissioned", smap.Version, smap.CountActivePs(), 156 smap.CountTargets()-1) 157 if err == tools.ErrTimedOutStabilize { 158 tlog.Logf("Retrying - checking with primary %s ...\n", smap.Primary.StringEx()) 159 proxyURL = smap.Primary.URL(cmn.NetPublic) 160 _, err = tools.WaitForClusterState(proxyURL, "target decommissioned", smap.Version, smap.CountActivePs(), 161 smap.CountTargets()-1) 162 } 163 if err != nil { 164 // fail the test but first, try to recover cluster membership 165 _ = tools.RestoreNode(cmd, false, "target") 166 time.Sleep(10 * time.Second) 167 tassert.CheckFatal(t, err) 168 } 169 170 vmdTargets := countVMDTargets(allTgtsMpaths) 171 tassert.Errorf(t, vmdTargets == smap.CountTargets()-1, "expected VMD to be found on %d targets, got %d.", 172 smap.CountTargets()-1, vmdTargets) 173 174 // restarting before the daemon fully terminates may result in "bind: address already in use" 175 err = tools.WaitNodePubAddrNotInUse(dcmTarget, time.Minute) 176 tassert.CheckFatal(t, err) 177 178 err = tools.RestoreNode(cmd, false, "target") 179 tassert.CheckFatal(t, err) 180 _, err = tools.WaitForClusterState(proxyURL, "target joined back", smap.Version, smap.CountActivePs(), 181 smap.CountTargets()) 182 tassert.CheckFatal(t, err) 183 184 smap = tools.GetClusterMap(t, proxyURL) 185 vmdTargets = countVMDTargets(allTgtsMpaths) 186 tassert.Errorf(t, vmdTargets == smap.CountTargets(), 187 "expected VMD to be found on all %d targets after joining cluster, got %d", 188 smap.CountTargets(), vmdTargets) 189 } 190 191 func TestMaintenanceDecommissionRebalance(t *testing.T) { 192 tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, RequiredDeployment: tools.ClusterTypeLocal, Long: true}) 193 var ( 194 proxyURL = tools.RandomProxyURL(t) 195 smap = tools.GetClusterMap(t, proxyURL) 196 baseParams = tools.BaseAPIParams(proxyURL) 197 objCount = 100 198 objPath = "ic-decomm/" 199 fileSize = cos.KiB 200 201 dcmTarget, _ = smap.GetRandTarget() 202 origTargetCount = smap.CountTargets() 203 origActiveProxyCount = smap.CountActivePs() 204 bck = cmn.Bck{Name: t.Name(), Provider: apc.AIS} 205 ) 206 tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs()) 207 208 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 209 for i := range objCount { 210 objName := fmt.Sprintf("%sobj%04d", objPath, i) 211 r, _ := readers.NewRand(int64(fileSize), cos.ChecksumXXHash) 212 _, err := api.PutObject(&api.PutArgs{ 213 BaseParams: baseParams, 214 Bck: bck, 215 ObjName: objName, 216 Reader: r, 217 Size: uint64(fileSize), 218 }) 219 tassert.CheckFatal(t, err) 220 } 221 222 tlog.Logf("Decommission %s\n", dcmTarget.StringEx()) 223 cmd := tools.GetRestoreCmd(dcmTarget) 224 msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), RmUserData: true, KeepInitialConfig: true} 225 rebID, err := api.DecommissionNode(baseParams, msg) 226 tassert.CheckError(t, err) 227 _, err = tools.WaitForClusterState(proxyURL, "target decommissioned", 228 smap.Version, origActiveProxyCount, origTargetCount-1, dcmTarget.ID()) 229 230 if err == tools.ErrTimedOutStabilize { 231 tlog.Logf("Retrying - checking with primary %s ...\n", smap.Primary.StringEx()) 232 proxyURL = smap.Primary.URL(cmn.NetPublic) 233 _, err = tools.WaitForClusterState(proxyURL, "target decommissioned", 234 smap.Version, origActiveProxyCount, origTargetCount-1, dcmTarget.ID()) 235 } 236 if err != nil { 237 // fail the test but first, try to recover cluster membership 238 _ = tools.RestoreNode(cmd, false, "target") 239 time.Sleep(10 * time.Second) 240 tassert.CheckFatal(t, err) 241 } 242 243 tools.WaitForRebalanceByID(t, baseParams, rebID) 244 245 msgList := &apc.LsoMsg{Prefix: objPath} 246 lst, err := api.ListObjects(baseParams, bck, msgList, api.ListArgs{}) 247 tassert.CheckError(t, err) 248 if lst != nil && len(lst.Entries) != objCount { 249 t.Errorf("Wrong number of objects: have %d, expected %d", len(lst.Entries), objCount) 250 } 251 252 // restarting before the daemon fully terminates may result in "bind: address already in use" 253 err = tools.WaitNodePubAddrNotInUse(dcmTarget, time.Minute) 254 tassert.CheckFatal(t, err) 255 256 smap = tools.GetClusterMap(t, proxyURL) 257 err = tools.RestoreNode(cmd, false, "target") 258 tassert.CheckFatal(t, err) 259 smap, err = tools.WaitForClusterState(proxyURL, "target restored", smap.Version, 0, 0) 260 tassert.CheckFatal(t, err) 261 262 // If any node is in maintenance cancel the state 263 var dcm *meta.Snode 264 for _, node := range smap.Tmap { 265 if smap.InMaintOrDecomm(node) { 266 dcm = node 267 break 268 } 269 } 270 if dcm != nil { 271 tlog.Logf("Canceling maintenance for %s\n", dcm.ID()) 272 args := xact.ArgsMsg{Kind: apc.ActRebalance} 273 err = api.AbortXaction(baseParams, &args) 274 tassert.CheckError(t, err) 275 val := &apc.ActValRmNode{DaemonID: dcm.ID()} 276 rebID, err = api.StopMaintenance(baseParams, val) 277 tassert.CheckError(t, err) 278 tools.WaitForRebalanceByID(t, baseParams, rebID) 279 } else { 280 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 281 _, err = api.WaitForXactionIC(baseParams, &args) 282 tassert.CheckError(t, err) 283 } 284 285 lst, err = api.ListObjects(baseParams, bck, msgList, api.ListArgs{}) 286 tassert.CheckError(t, err) 287 if lst != nil && len(lst.Entries) != objCount { 288 t.Errorf("Invalid number of objects: %d, expected %d", len(lst.Entries), objCount) 289 } 290 } 291 292 func countVMDTargets(tsMpaths map[*meta.Snode][]string) (total int) { 293 for _, mpaths := range tsMpaths { 294 for _, mpath := range mpaths { 295 if err := cos.Stat(filepath.Join(mpath, fname.Vmd)); err == nil { 296 total++ 297 break 298 } 299 } 300 } 301 return 302 } 303 304 func TestMaintenanceRebalance(t *testing.T) { 305 tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, Long: true}) 306 var ( 307 bck = cmn.Bck{Name: "maint-reb", Provider: apc.AIS} 308 m = &ioContext{ 309 t: t, 310 num: 30, 311 fileSize: 512, 312 fixedSize: true, 313 bck: bck, 314 numGetsEachFile: 1, 315 proxyURL: proxyURL, 316 } 317 actVal = &apc.ActValRmNode{} 318 proxyURL = tools.RandomProxyURL(t) 319 baseParams = tools.BaseAPIParams(proxyURL) 320 ) 321 322 m.initAndSaveState(true /*cleanup*/) 323 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 324 origProxyCnt, origTargetCount := m.smap.CountActivePs(), m.smap.CountActiveTs() 325 326 m.puts() 327 tsi, _ := m.smap.GetRandTarget() 328 tlog.Logf("Removing %s\n", tsi.StringEx()) 329 restored := false 330 actVal.DaemonID = tsi.ID() 331 rebID, err := api.StartMaintenance(baseParams, actVal) 332 tassert.CheckError(t, err) 333 defer func() { 334 if !restored { 335 rebID, err := api.StopMaintenance(baseParams, actVal) 336 tassert.CheckError(t, err) 337 _, err = tools.WaitForClusterState( 338 proxyURL, 339 "target joined (2nd attempt)", 340 m.smap.Version, origProxyCnt, origTargetCount, 341 ) 342 tassert.CheckFatal(t, err) 343 tools.WaitForRebalanceByID(t, baseParams, rebID) 344 } 345 tools.ClearMaintenance(baseParams, tsi) 346 }() 347 tools.WaitForRebalanceByID(t, baseParams, rebID) 348 349 smap, err := tools.WaitForClusterState( 350 proxyURL, 351 "target removed from the cluster", 352 m.smap.Version, origProxyCnt, origTargetCount-1, tsi.ID(), 353 ) 354 tassert.CheckFatal(t, err) 355 m.smap = smap 356 357 m.gets(nil, false) 358 m.ensureNoGetErrors() 359 360 rebID, err = api.StopMaintenance(baseParams, actVal) 361 tassert.CheckFatal(t, err) 362 smap, err = tools.WaitForClusterState( 363 proxyURL, 364 "target joined", 365 m.smap.Version, origProxyCnt, origTargetCount, 366 ) 367 tassert.CheckFatal(t, err) 368 restored = true 369 m.smap = smap 370 371 tools.WaitForRebalanceByID(t, baseParams, rebID) 372 } 373 374 func TestMaintenanceGetWhileRebalance(t *testing.T) { 375 tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, Long: true}) 376 var ( 377 bck = cmn.Bck{Name: "maint-get-reb", Provider: apc.AIS} 378 m = &ioContext{ 379 t: t, 380 num: 5000, 381 fileSize: 1024, 382 fixedSize: true, 383 bck: bck, 384 numGetsEachFile: 1, 385 proxyURL: proxyURL, 386 } 387 actVal = &apc.ActValRmNode{} 388 proxyURL = tools.RandomProxyURL(t) 389 baseParams = tools.BaseAPIParams(proxyURL) 390 ) 391 392 m.initAndSaveState(true /*cleanup*/) 393 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 394 origProxyCnt, origTargetCount := m.smap.CountActivePs(), m.smap.CountActiveTs() 395 396 m.puts() 397 go m.getsUntilStop() 398 stopped := false 399 400 tsi, _ := m.smap.GetRandTarget() 401 tlog.Logf("Removing %s\n", tsi.StringEx()) 402 restored := false 403 actVal.DaemonID = tsi.ID() 404 rebID, err := api.StartMaintenance(baseParams, actVal) 405 tassert.CheckFatal(t, err) 406 defer func() { 407 if !stopped { 408 m.stopGets() 409 } 410 if !restored { 411 rebID, err := api.StopMaintenance(baseParams, actVal) 412 tassert.CheckFatal(t, err) 413 _, err = tools.WaitForClusterState( 414 proxyURL, 415 "target joined", 416 m.smap.Version, origProxyCnt, origTargetCount, 417 ) 418 tassert.CheckFatal(t, err) 419 tools.WaitForRebalanceByID(t, baseParams, rebID) 420 } 421 tools.ClearMaintenance(baseParams, tsi) 422 }() 423 tools.WaitForRebalanceByID(t, baseParams, rebID) 424 425 smap, err := tools.WaitForClusterState( 426 proxyURL, 427 "target removed from the cluster", 428 m.smap.Version, origProxyCnt, origTargetCount-1, tsi.ID(), 429 ) 430 tassert.CheckFatal(t, err) 431 m.smap = smap 432 433 m.stopGets() 434 stopped = true 435 m.ensureNoGetErrors() 436 437 rebID, err = api.StopMaintenance(baseParams, actVal) 438 tassert.CheckFatal(t, err) 439 restored = true 440 smap, err = tools.WaitForClusterState( 441 proxyURL, 442 "target joined", 443 m.smap.Version, origProxyCnt, origTargetCount, 444 ) 445 tassert.CheckFatal(t, err) 446 m.smap = smap 447 tools.WaitForRebalanceByID(t, baseParams, rebID) 448 } 449 450 func TestNodeShutdown(t *testing.T) { 451 for _, ty := range []string{apc.Proxy, apc.Target} { 452 t.Run(ty, func(t *testing.T) { 453 testNodeShutdown(t, ty) 454 time.Sleep(time.Second) 455 }) 456 } 457 } 458 459 // TODO -- FIXME: pass with a single target 460 func testNodeShutdown(t *testing.T, nodeType string) { 461 const minNumNodes = 2 462 var ( 463 proxyURL = tools.GetPrimaryURL() 464 smap = tools.GetClusterMap(t, proxyURL) 465 node *meta.Snode 466 err error 467 pdc, tdc int 468 469 origProxyCnt = smap.CountActivePs() 470 origTargetCount = smap.CountActiveTs() 471 ) 472 if nodeType == apc.Proxy { 473 if origProxyCnt < minNumNodes { 474 t.Skipf("%s requires at least %d gateway%s (have %d)", 475 t.Name(), minNumNodes, cos.Plural(minNumNodes), origProxyCnt) 476 } 477 node, err = smap.GetRandProxy(true) 478 pdc = 1 479 } else { 480 if origTargetCount < minNumNodes { 481 t.Skipf("%s requires at least %d target%s (have %d)", 482 t.Name(), minNumNodes, cos.Plural(minNumNodes), origTargetCount) 483 } 484 bck := cmn.Bck{Name: "shutdown-node" + cos.GenTie(), Provider: apc.AIS} 485 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 486 487 node, err = smap.GetRandTarget() 488 tdc = 1 489 } 490 tassert.CheckFatal(t, err) 491 492 // 1. Shutdown a random node. 493 _, cmd, rebID, err := tools.ShutdownNode(t, baseParams, node) 494 tassert.CheckFatal(t, err) 495 if nodeType == apc.Target && origTargetCount > 1 { 496 time.Sleep(time.Second) 497 xargs := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 498 for range 3 { 499 status, err := api.WaitForXactionIC(baseParams, &xargs) 500 if err == nil { 501 tlog.Logf("%v\n", status) 502 break 503 } 504 herr := cmn.Err2HTTPErr(err) 505 tassert.Errorf(t, herr.Status == http.StatusNotFound, "expecting not found, got %+v", herr) 506 time.Sleep(time.Second) 507 } 508 } 509 510 smap, err = tools.WaitForClusterState(proxyURL, "shutdown node", 511 smap.Version, origProxyCnt-pdc, origTargetCount-tdc, node.ID()) 512 tassert.CheckFatal(t, err) 513 tassert.Fatalf(t, smap.GetNode(node.ID()) != nil, "node %s does not exist in %s after shutdown", node.ID(), smap) 514 tassert.Errorf(t, smap.GetNode(node.ID()).Flags.IsSet(meta.SnodeMaint), 515 "node should be in maintenance mode after shutdown") 516 517 // restarting before the daemon fully terminates may result in "bind: address already in use" 518 err = tools.WaitNodePubAddrNotInUse(node, time.Minute) 519 tassert.CheckFatal(t, err) 520 521 // 3. Start node again. 522 err = tools.RestoreNode(cmd, false, nodeType) 523 tassert.CheckError(t, err) 524 time.Sleep(5 * time.Second) // FIXME: wait-for(node started) 525 smap = tools.GetClusterMap(t, proxyURL) 526 tassert.Fatalf(t, smap.GetNode(node.ID()) != nil, "node %s does not exist in %s after restart", node.ID(), smap) 527 tassert.Errorf(t, smap.GetNode(node.ID()).Flags.IsSet(meta.SnodeMaint), 528 "node should be in maintenance mode after restart") 529 530 // 4. Remove the node from maintenance. 531 _, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: node.ID()}) 532 tassert.CheckError(t, err) 533 _, err = tools.WaitForClusterState(proxyURL, "remove node from maintenance", 534 smap.Version, origProxyCnt, origTargetCount) 535 tassert.CheckError(t, err) 536 537 if nodeType == apc.Target { 538 tools.WaitForRebalAndResil(t, baseParams) 539 } 540 } 541 542 func TestShutdownListObjects(t *testing.T) { 543 tools.CheckSkip(t, &tools.SkipTestArgs{Long: true}) 544 var ( 545 bck = cmn.Bck{Name: "shutdown-list", Provider: apc.AIS} 546 m = &ioContext{ 547 t: t, 548 num: 1500, 549 fileSize: cos.KiB, 550 fixedSize: true, 551 bck: bck, 552 proxyURL: proxyURL, 553 } 554 proxyURL = tools.RandomProxyURL(t) 555 baseParams = tools.BaseAPIParams(proxyURL) 556 origEntries = make(map[string]*cmn.LsoEnt, m.num) 557 ) 558 559 m.initAndSaveState(true /*cleanup*/) 560 origTargetCount := m.smap.CountActiveTs() 561 tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/) 562 m.puts() 563 564 // 1. Perform list-object and populate entries map. 565 msg := &apc.LsoMsg{} 566 msg.AddProps(apc.GetPropsChecksum, apc.GetPropsCopies, apc.GetPropsSize) 567 lst, err := api.ListObjects(baseParams, bck, msg, api.ListArgs{}) 568 tassert.CheckFatal(t, err) 569 tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d", 570 m.num, len(lst.Entries)) 571 for _, en := range lst.Entries { 572 origEntries[en.Name] = en 573 } 574 575 // 2. Shut down a random target. 576 tsi, _ := m.smap.GetRandTarget() 577 _, cmd, rebID, err := tools.ShutdownNode(t, baseParams, tsi) 578 tassert.CheckFatal(t, err) 579 580 // Restore target after test is over. 581 t.Cleanup(func() { 582 // restarting before the daemon fully terminates may result in "bind: address already in use" 583 err = tools.WaitNodePubAddrNotInUse(tsi, time.Minute) 584 tassert.CheckFatal(t, err) 585 586 err = tools.RestoreNode(cmd, false, apc.Target) 587 tassert.CheckError(t, err) 588 589 // first, activate target, second, wait-for-cluster-state 590 time.Sleep(time.Second) 591 592 _, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: tsi.ID()}) 593 if err != nil { 594 time.Sleep(3 * time.Second) 595 _, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: tsi.ID()}) 596 } 597 tassert.CheckError(t, err) 598 _, err = tools.WaitForClusterState(proxyURL, "remove node from maintenance", m.smap.Version, 0, origTargetCount) 599 tassert.CheckError(t, err) 600 601 tools.WaitForRebalAndResil(t, baseParams) 602 }) 603 604 if origTargetCount > 1 { 605 time.Sleep(time.Second) 606 xargs := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 607 for range 3 { 608 status, err := api.WaitForXactionIC(baseParams, &xargs) 609 if err == nil { 610 tlog.Logf("%v\n", status) 611 break 612 } 613 herr := cmn.Err2HTTPErr(err) 614 tassert.Errorf(t, herr.Status == http.StatusNotFound, "expecting not found, got %+v", herr) 615 time.Sleep(time.Second) 616 } 617 } 618 619 m.smap, err = tools.WaitForClusterState(proxyURL, "target shutdown", m.smap.Version, 0, origTargetCount-1, tsi.ID()) 620 tassert.CheckFatal(t, err) 621 622 // 3. Check if we can list all the objects. 623 if m.smap.CountActiveTs() == 0 { 624 tlog.Logln("Shutdown single target - nothing to do") 625 return 626 } 627 tlog.Logln("Listing objects") 628 lst, err = api.ListObjects(baseParams, bck, msg, api.ListArgs{}) 629 tassert.CheckFatal(t, err) 630 tassert.Errorf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d", 631 m.num, len(lst.Entries)) 632 for _, en := range lst.Entries { 633 origEntry, ok := origEntries[en.Name] 634 tassert.Errorf(t, ok, "object %s missing in original entries", en.Name) 635 if en.Version != origEntry.Version || 636 en.Flags != origEntry.Flags || 637 en.Copies != origEntry.Copies { 638 t.Errorf("some fields of object %q, don't match: %#v v/s %#v ", en.Name, en, origEntry) 639 } 640 } 641 }