github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/maintain_test.go (about)

     1  // Package integration_test.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package integration_test
     6  
     7  import (
     8  	"fmt"
     9  	"net/http"
    10  	"path/filepath"
    11  	"testing"
    12  	"time"
    13  
    14  	"github.com/NVIDIA/aistore/api"
    15  	"github.com/NVIDIA/aistore/api/apc"
    16  	"github.com/NVIDIA/aistore/cmn"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/fname"
    19  	"github.com/NVIDIA/aistore/core/meta"
    20  	"github.com/NVIDIA/aistore/tools"
    21  	"github.com/NVIDIA/aistore/tools/readers"
    22  	"github.com/NVIDIA/aistore/tools/tassert"
    23  	"github.com/NVIDIA/aistore/tools/tlog"
    24  	"github.com/NVIDIA/aistore/xact"
    25  )
    26  
    27  func TestMaintenanceOnOff(t *testing.T) {
    28  	tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3})
    29  	proxyURL := tools.RandomProxyURL(t)
    30  	smap := tools.GetClusterMap(t, proxyURL)
    31  
    32  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
    33  
    34  	// Invalid target case
    35  	msg := &apc.ActValRmNode{DaemonID: "fakeID", SkipRebalance: true}
    36  	_, err := api.StartMaintenance(baseParams, msg)
    37  	tassert.Fatalf(t, err != nil, "Maintenance for invalid daemon ID succeeded")
    38  
    39  	mntTarget, _ := smap.GetRandTarget()
    40  	msg.DaemonID = mntTarget.ID()
    41  	baseParams := tools.BaseAPIParams(proxyURL)
    42  	_, err = api.StartMaintenance(baseParams, msg)
    43  	tassert.CheckFatal(t, err)
    44  	smap, err = tools.WaitForClusterState(proxyURL, "target in maintenance",
    45  		smap.Version, smap.CountActivePs(), smap.CountActiveTs()-1)
    46  	tassert.CheckFatal(t, err)
    47  	_, err = api.StopMaintenance(baseParams, msg)
    48  	tassert.CheckFatal(t, err)
    49  	_, err = tools.WaitForClusterState(proxyURL, "target is back",
    50  		smap.Version, smap.CountActivePs(), smap.CountTargets())
    51  	tassert.CheckFatal(t, err)
    52  	_, err = api.StopMaintenance(baseParams, msg)
    53  	tassert.Fatalf(t, err != nil, "Canceling maintenance must fail for 'normal' daemon")
    54  }
    55  
    56  func TestMaintenanceListObjects(t *testing.T) {
    57  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, MinTargets: 3})
    58  
    59  	var (
    60  		bck = cmn.Bck{Name: "maint-list", Provider: apc.AIS}
    61  		m   = &ioContext{
    62  			t:         t,
    63  			num:       1500,
    64  			fileSize:  cos.KiB,
    65  			fixedSize: true,
    66  			bck:       bck,
    67  			proxyURL:  proxyURL,
    68  		}
    69  		proxyURL    = tools.RandomProxyURL(t)
    70  		baseParams  = tools.BaseAPIParams(proxyURL)
    71  		origEntries = make(map[string]*cmn.LsoEnt, 1500)
    72  	)
    73  
    74  	m.initAndSaveState(true /*cleanup*/)
    75  	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
    76  
    77  	m.puts()
    78  	// 1. Perform list-object and populate entries map
    79  	msg := &apc.LsoMsg{}
    80  	msg.AddProps(apc.GetPropsChecksum, apc.GetPropsVersion, apc.GetPropsCopies, apc.GetPropsSize)
    81  	lst, err := api.ListObjects(baseParams, bck, msg, api.ListArgs{})
    82  	tassert.CheckFatal(t, err)
    83  	tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d",
    84  		m.num, len(lst.Entries))
    85  	for _, en := range lst.Entries {
    86  		origEntries[en.Name] = en
    87  	}
    88  
    89  	// 2. Put a random target in maintenance mode
    90  	tsi, _ := m.smap.GetRandTarget()
    91  	tlog.Logf("Put target %s in maintenance mode\n", tsi.StringEx())
    92  	actVal := &apc.ActValRmNode{DaemonID: tsi.ID(), SkipRebalance: false}
    93  	rebID, err := api.StartMaintenance(baseParams, actVal)
    94  	tassert.CheckFatal(t, err)
    95  
    96  	defer func() {
    97  		rebID, err = api.StopMaintenance(baseParams, actVal)
    98  		tassert.CheckFatal(t, err)
    99  		_, err = tools.WaitForClusterState(proxyURL, "target is back",
   100  			m.smap.Version, m.smap.CountActivePs(), m.smap.CountTargets())
   101  		args := xact.ArgsMsg{ID: rebID, Timeout: tools.RebalanceTimeout}
   102  		_, err = api.WaitForXactionIC(baseParams, &args)
   103  		tassert.CheckFatal(t, err)
   104  	}()
   105  
   106  	m.smap, err = tools.WaitForClusterState(proxyURL, "target in maintenance",
   107  		m.smap.Version, m.smap.CountActivePs(), m.smap.CountActiveTs()-1)
   108  	tassert.CheckFatal(t, err)
   109  
   110  	tools.WaitForRebalanceByID(t, baseParams, rebID)
   111  
   112  	// 3. Check if we can list all the objects
   113  	lst, err = api.ListObjects(baseParams, bck, msg, api.ListArgs{})
   114  	tassert.CheckFatal(t, err)
   115  	tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d",
   116  		m.num, len(lst.Entries))
   117  	for _, en := range lst.Entries {
   118  		origEntry, ok := origEntries[en.Name]
   119  		tassert.Fatalf(t, ok, "object %s missing in original entries", en.Name)
   120  		if en.Checksum != origEntry.Checksum ||
   121  			en.Version != origEntry.Version ||
   122  			en.Flags != origEntry.Flags ||
   123  			en.Copies != origEntry.Copies {
   124  			t.Errorf("some fields of object %q, don't match: %#v v/s %#v ", en.Name, en, origEntry)
   125  		}
   126  	}
   127  }
   128  
// TestMaintenanceMD decommissions a random target (skipping rebalance but
// keeping its initial config), verifies that the VMD file disappears from the
// decommissioned target's mountpaths, restores the node, and then verifies
// that a VMD is present on all targets again.
func TestMaintenanceMD(t *testing.T) {
	// NOTE: this test requires local deployment as it checks local filesystem for VMDs.
	tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, RequiredDeployment: tools.ClusterTypeLocal})

	var (
		proxyURL   = tools.RandomProxyURL(t)
		smap       = tools.GetClusterMap(t, proxyURL)
		baseParams = tools.BaseAPIParams(proxyURL)

		dcmTarget, _  = smap.GetRandTarget()
		allTgtsMpaths = tools.GetTargetsMountpaths(t, smap, baseParams)
	)

	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	t.Cleanup(func() {
		// Best effort: wait out any still-running rebalance before the next test;
		// the result is deliberately ignored.
		args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout}
		api.WaitForXactionIC(baseParams, &args)
	})

	tlog.Logf("Decommission %s\n", dcmTarget.StringEx())
	// Grab the restore command before the node goes away.
	cmd := tools.GetRestoreCmd(dcmTarget)
	msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), SkipRebalance: true, KeepInitialConfig: true}
	_, err := api.DecommissionNode(baseParams, msg)
	tassert.CheckFatal(t, err)

	_, err = tools.WaitForClusterState(proxyURL, "target decommissioned", smap.Version, smap.CountActivePs(),
		smap.CountTargets()-1)
	if err == tools.ErrTimedOutStabilize {
		// The randomly chosen proxy may itself be lagging - retry via the primary.
		tlog.Logf("Retrying - checking with primary %s ...\n", smap.Primary.StringEx())
		proxyURL = smap.Primary.URL(cmn.NetPublic)
		_, err = tools.WaitForClusterState(proxyURL, "target decommissioned", smap.Version, smap.CountActivePs(),
			smap.CountTargets()-1)
	}
	if err != nil {
		// fail the test but first, try to recover cluster membership
		_ = tools.RestoreNode(cmd, false, "target")
		time.Sleep(10 * time.Second)
		tassert.CheckFatal(t, err)
	}

	// The decommissioned target must have removed its VMD.
	vmdTargets := countVMDTargets(allTgtsMpaths)
	tassert.Errorf(t, vmdTargets == smap.CountTargets()-1, "expected VMD to be found on %d targets, got %d.",
		smap.CountTargets()-1, vmdTargets)

	// restarting before the daemon fully terminates may result in "bind: address already in use"
	err = tools.WaitNodePubAddrNotInUse(dcmTarget, time.Minute)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(cmd, false, "target")
	tassert.CheckFatal(t, err)
	_, err = tools.WaitForClusterState(proxyURL, "target joined back", smap.Version, smap.CountActivePs(),
		smap.CountTargets())
	tassert.CheckFatal(t, err)

	// After rejoining, every target must have a VMD again.
	smap = tools.GetClusterMap(t, proxyURL)
	vmdTargets = countVMDTargets(allTgtsMpaths)
	tassert.Errorf(t, vmdTargets == smap.CountTargets(),
		"expected VMD to be found on all %d targets after joining cluster, got %d",
		smap.CountTargets(), vmdTargets)
}
   190  
// TestMaintenanceDecommissionRebalance decommissions a random target (with
// rebalance and user-data removal), verifies that no objects were lost, then
// restores the node and re-verifies the object count.
func TestMaintenanceDecommissionRebalance(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, RequiredDeployment: tools.ClusterTypeLocal, Long: true})
	var (
		proxyURL   = tools.RandomProxyURL(t)
		smap       = tools.GetClusterMap(t, proxyURL)
		baseParams = tools.BaseAPIParams(proxyURL)
		objCount   = 100
		objPath    = "ic-decomm/"
		fileSize   = cos.KiB

		dcmTarget, _         = smap.GetRandTarget()
		origTargetCount      = smap.CountTargets()
		origActiveProxyCount = smap.CountActivePs()
		bck                  = cmn.Bck{Name: t.Name(), Provider: apc.AIS}
	)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	// Populate the bucket with objCount small random objects.
	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
	for i := range objCount {
		objName := fmt.Sprintf("%sobj%04d", objPath, i)
		r, _ := readers.NewRand(int64(fileSize), cos.ChecksumXXHash)
		_, err := api.PutObject(&api.PutArgs{
			BaseParams: baseParams,
			Bck:        bck,
			ObjName:    objName,
			Reader:     r,
			Size:       uint64(fileSize),
		})
		tassert.CheckFatal(t, err)
	}

	tlog.Logf("Decommission %s\n", dcmTarget.StringEx())
	// Grab the restore command before the node goes away.
	cmd := tools.GetRestoreCmd(dcmTarget)
	msg := &apc.ActValRmNode{DaemonID: dcmTarget.ID(), RmUserData: true, KeepInitialConfig: true}
	rebID, err := api.DecommissionNode(baseParams, msg)
	tassert.CheckError(t, err)
	_, err = tools.WaitForClusterState(proxyURL, "target decommissioned",
		smap.Version, origActiveProxyCount, origTargetCount-1, dcmTarget.ID())

	if err == tools.ErrTimedOutStabilize {
		// The randomly chosen proxy may itself be lagging - retry via the primary.
		tlog.Logf("Retrying - checking with primary %s ...\n", smap.Primary.StringEx())
		proxyURL = smap.Primary.URL(cmn.NetPublic)
		_, err = tools.WaitForClusterState(proxyURL, "target decommissioned",
			smap.Version, origActiveProxyCount, origTargetCount-1, dcmTarget.ID())
	}
	if err != nil {
		// fail the test but first, try to recover cluster membership
		_ = tools.RestoreNode(cmd, false, "target")
		time.Sleep(10 * time.Second)
		tassert.CheckFatal(t, err)
	}

	tools.WaitForRebalanceByID(t, baseParams, rebID)

	// Rebalance must have preserved all objects.
	msgList := &apc.LsoMsg{Prefix: objPath}
	lst, err := api.ListObjects(baseParams, bck, msgList, api.ListArgs{})
	tassert.CheckError(t, err)
	if lst != nil && len(lst.Entries) != objCount {
		t.Errorf("Wrong number of objects: have %d, expected %d", len(lst.Entries), objCount)
	}

	// restarting before the daemon fully terminates may result in "bind: address already in use"
	err = tools.WaitNodePubAddrNotInUse(dcmTarget, time.Minute)
	tassert.CheckFatal(t, err)

	smap = tools.GetClusterMap(t, proxyURL)
	err = tools.RestoreNode(cmd, false, "target")
	tassert.CheckFatal(t, err)
	smap, err = tools.WaitForClusterState(proxyURL, "target restored", smap.Version, 0, 0)
	tassert.CheckFatal(t, err)

	// If any node is in maintenance cancel the state
	var dcm *meta.Snode
	for _, node := range smap.Tmap {
		if smap.InMaintOrDecomm(node) {
			dcm = node
			break
		}
	}
	if dcm != nil {
		// Abort any running rebalance first, then clear maintenance and wait
		// for the rebalance that StopMaintenance triggers.
		tlog.Logf("Canceling maintenance for %s\n", dcm.ID())
		args := xact.ArgsMsg{Kind: apc.ActRebalance}
		err = api.AbortXaction(baseParams, &args)
		tassert.CheckError(t, err)
		val := &apc.ActValRmNode{DaemonID: dcm.ID()}
		rebID, err = api.StopMaintenance(baseParams, val)
		tassert.CheckError(t, err)
		tools.WaitForRebalanceByID(t, baseParams, rebID)
	} else {
		// No node in maintenance - just wait out any in-flight rebalance.
		args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout}
		_, err = api.WaitForXactionIC(baseParams, &args)
		tassert.CheckError(t, err)
	}

	// Final check: object count must still match after restore.
	lst, err = api.ListObjects(baseParams, bck, msgList, api.ListArgs{})
	tassert.CheckError(t, err)
	if lst != nil && len(lst.Entries) != objCount {
		t.Errorf("Invalid number of objects: %d, expected %d", len(lst.Entries), objCount)
	}
}
   291  
   292  func countVMDTargets(tsMpaths map[*meta.Snode][]string) (total int) {
   293  	for _, mpaths := range tsMpaths {
   294  		for _, mpath := range mpaths {
   295  			if err := cos.Stat(filepath.Join(mpath, fname.Vmd)); err == nil {
   296  				total++
   297  				break
   298  			}
   299  		}
   300  	}
   301  	return
   302  }
   303  
// TestMaintenanceRebalance puts a random target into maintenance (with
// rebalance), GETs every object to verify data availability, then returns the
// target and waits for the second rebalance.
func TestMaintenanceRebalance(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, Long: true})
	var (
		bck = cmn.Bck{Name: "maint-reb", Provider: apc.AIS}
		m   = &ioContext{
			t:               t,
			num:             30,
			fileSize:        512,
			fixedSize:       true,
			bck:             bck,
			numGetsEachFile: 1,
			// NOTE(review): resolves to the package-level proxyURL - the local
			// one below is declared later in this var block; confirm intended.
			proxyURL: proxyURL,
		}
		actVal     = &apc.ActValRmNode{}
		proxyURL   = tools.RandomProxyURL(t)
		baseParams = tools.BaseAPIParams(proxyURL)
	)

	m.initAndSaveState(true /*cleanup*/)
	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
	origProxyCnt, origTargetCount := m.smap.CountActivePs(), m.smap.CountActiveTs()

	m.puts()
	m.puts()
	tsi, _ := m.smap.GetRandTarget()
	tlog.Logf("Removing %s\n", tsi.StringEx())
	restored := false
	actVal.DaemonID = tsi.ID()
	rebID, err := api.StartMaintenance(baseParams, actVal)
	tassert.CheckError(t, err)
	defer func() {
		// If the test bailed out before restoring the target, bring it back
		// and wait for the resulting rebalance; always clear maintenance.
		if !restored {
			rebID, err := api.StopMaintenance(baseParams, actVal)
			tassert.CheckError(t, err)
			_, err = tools.WaitForClusterState(
				proxyURL,
				"target joined (2nd attempt)",
				m.smap.Version, origProxyCnt, origTargetCount,
			)
			tassert.CheckFatal(t, err)
			tools.WaitForRebalanceByID(t, baseParams, rebID)
		}
		tools.ClearMaintenance(baseParams, tsi)
	}()
	tools.WaitForRebalanceByID(t, baseParams, rebID)

	smap, err := tools.WaitForClusterState(
		proxyURL,
		"target removed from the cluster",
		m.smap.Version, origProxyCnt, origTargetCount-1, tsi.ID(),
	)
	tassert.CheckFatal(t, err)
	m.smap = smap

	// All objects must remain readable while the target is in maintenance.
	m.gets(nil, false)
	m.ensureNoGetErrors()

	rebID, err = api.StopMaintenance(baseParams, actVal)
	tassert.CheckFatal(t, err)
	smap, err = tools.WaitForClusterState(
		proxyURL,
		"target joined",
		m.smap.Version, origProxyCnt, origTargetCount,
	)
	tassert.CheckFatal(t, err)
	restored = true
	m.smap = smap

	tools.WaitForRebalanceByID(t, baseParams, rebID)
}
   373  
// TestMaintenanceGetWhileRebalance runs continuous GETs in the background
// while a random target is placed in maintenance (with rebalance), verifies
// that no GET errors occurred, then restores the target and waits for the
// second rebalance.
func TestMaintenanceGetWhileRebalance(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{MinTargets: 3, Long: true})
	var (
		bck = cmn.Bck{Name: "maint-get-reb", Provider: apc.AIS}
		m   = &ioContext{
			t:               t,
			num:             5000,
			fileSize:        1024,
			fixedSize:       true,
			bck:             bck,
			numGetsEachFile: 1,
			// NOTE(review): resolves to the package-level proxyURL - the local
			// one below is declared later in this var block; confirm intended.
			proxyURL: proxyURL,
		}
		actVal     = &apc.ActValRmNode{}
		proxyURL   = tools.RandomProxyURL(t)
		baseParams = tools.BaseAPIParams(proxyURL)
	)

	m.initAndSaveState(true /*cleanup*/)
	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
	origProxyCnt, origTargetCount := m.smap.CountActivePs(), m.smap.CountActiveTs()

	m.puts()
	// Background GETs run until stopGets is called.
	go m.getsUntilStop()
	stopped := false

	tsi, _ := m.smap.GetRandTarget()
	tlog.Logf("Removing %s\n", tsi.StringEx())
	restored := false
	actVal.DaemonID = tsi.ID()
	rebID, err := api.StartMaintenance(baseParams, actVal)
	tassert.CheckFatal(t, err)
	defer func() {
		// Stop the background GETs and, on early failure, restore the target.
		if !stopped {
			m.stopGets()
		}
		if !restored {
			rebID, err := api.StopMaintenance(baseParams, actVal)
			tassert.CheckFatal(t, err)
			_, err = tools.WaitForClusterState(
				proxyURL,
				"target joined",
				m.smap.Version, origProxyCnt, origTargetCount,
			)
			tassert.CheckFatal(t, err)
			tools.WaitForRebalanceByID(t, baseParams, rebID)
		}
		tools.ClearMaintenance(baseParams, tsi)
	}()
	tools.WaitForRebalanceByID(t, baseParams, rebID)

	smap, err := tools.WaitForClusterState(
		proxyURL,
		"target removed from the cluster",
		m.smap.Version, origProxyCnt, origTargetCount-1, tsi.ID(),
	)
	tassert.CheckFatal(t, err)
	m.smap = smap

	// GETs must have completed without errors despite the rebalance.
	m.stopGets()
	stopped = true
	m.ensureNoGetErrors()

	rebID, err = api.StopMaintenance(baseParams, actVal)
	tassert.CheckFatal(t, err)
	restored = true
	smap, err = tools.WaitForClusterState(
		proxyURL,
		"target joined",
		m.smap.Version, origProxyCnt, origTargetCount,
	)
	tassert.CheckFatal(t, err)
	m.smap = smap
	tools.WaitForRebalanceByID(t, baseParams, rebID)
}
   449  
   450  func TestNodeShutdown(t *testing.T) {
   451  	for _, ty := range []string{apc.Proxy, apc.Target} {
   452  		t.Run(ty, func(t *testing.T) {
   453  			testNodeShutdown(t, ty)
   454  			time.Sleep(time.Second)
   455  		})
   456  	}
   457  }
   458  
   459  // TODO -- FIXME: pass with a single target
   460  func testNodeShutdown(t *testing.T, nodeType string) {
   461  	const minNumNodes = 2
   462  	var (
   463  		proxyURL = tools.GetPrimaryURL()
   464  		smap     = tools.GetClusterMap(t, proxyURL)
   465  		node     *meta.Snode
   466  		err      error
   467  		pdc, tdc int
   468  
   469  		origProxyCnt    = smap.CountActivePs()
   470  		origTargetCount = smap.CountActiveTs()
   471  	)
   472  	if nodeType == apc.Proxy {
   473  		if origProxyCnt < minNumNodes {
   474  			t.Skipf("%s requires at least %d gateway%s (have %d)",
   475  				t.Name(), minNumNodes, cos.Plural(minNumNodes), origProxyCnt)
   476  		}
   477  		node, err = smap.GetRandProxy(true)
   478  		pdc = 1
   479  	} else {
   480  		if origTargetCount < minNumNodes {
   481  			t.Skipf("%s requires at least %d target%s (have %d)",
   482  				t.Name(), minNumNodes, cos.Plural(minNumNodes), origTargetCount)
   483  		}
   484  		bck := cmn.Bck{Name: "shutdown-node" + cos.GenTie(), Provider: apc.AIS}
   485  		tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
   486  
   487  		node, err = smap.GetRandTarget()
   488  		tdc = 1
   489  	}
   490  	tassert.CheckFatal(t, err)
   491  
   492  	// 1. Shutdown a random node.
   493  	_, cmd, rebID, err := tools.ShutdownNode(t, baseParams, node)
   494  	tassert.CheckFatal(t, err)
   495  	if nodeType == apc.Target && origTargetCount > 1 {
   496  		time.Sleep(time.Second)
   497  		xargs := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout}
   498  		for range 3 {
   499  			status, err := api.WaitForXactionIC(baseParams, &xargs)
   500  			if err == nil {
   501  				tlog.Logf("%v\n", status)
   502  				break
   503  			}
   504  			herr := cmn.Err2HTTPErr(err)
   505  			tassert.Errorf(t, herr.Status == http.StatusNotFound, "expecting not found, got %+v", herr)
   506  			time.Sleep(time.Second)
   507  		}
   508  	}
   509  
   510  	smap, err = tools.WaitForClusterState(proxyURL, "shutdown node",
   511  		smap.Version, origProxyCnt-pdc, origTargetCount-tdc, node.ID())
   512  	tassert.CheckFatal(t, err)
   513  	tassert.Fatalf(t, smap.GetNode(node.ID()) != nil, "node %s does not exist in %s after shutdown", node.ID(), smap)
   514  	tassert.Errorf(t, smap.GetNode(node.ID()).Flags.IsSet(meta.SnodeMaint),
   515  		"node should be in maintenance mode after shutdown")
   516  
   517  	// restarting before the daemon fully terminates may result in "bind: address already in use"
   518  	err = tools.WaitNodePubAddrNotInUse(node, time.Minute)
   519  	tassert.CheckFatal(t, err)
   520  
   521  	// 3. Start node again.
   522  	err = tools.RestoreNode(cmd, false, nodeType)
   523  	tassert.CheckError(t, err)
   524  	time.Sleep(5 * time.Second) // FIXME: wait-for(node started)
   525  	smap = tools.GetClusterMap(t, proxyURL)
   526  	tassert.Fatalf(t, smap.GetNode(node.ID()) != nil, "node %s does not exist in %s after restart", node.ID(), smap)
   527  	tassert.Errorf(t, smap.GetNode(node.ID()).Flags.IsSet(meta.SnodeMaint),
   528  		"node should be in maintenance mode after restart")
   529  
   530  	// 4. Remove the node from maintenance.
   531  	_, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: node.ID()})
   532  	tassert.CheckError(t, err)
   533  	_, err = tools.WaitForClusterState(proxyURL, "remove node from maintenance",
   534  		smap.Version, origProxyCnt, origTargetCount)
   535  	tassert.CheckError(t, err)
   536  
   537  	if nodeType == apc.Target {
   538  		tools.WaitForRebalAndResil(t, baseParams)
   539  	}
   540  }
   541  
// TestShutdownListObjects shuts down a random target and verifies that
// list-objects still returns all objects with unchanged metadata; the target
// is restarted and taken out of maintenance in cleanup.
func TestShutdownListObjects(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
	var (
		bck = cmn.Bck{Name: "shutdown-list", Provider: apc.AIS}
		m   = &ioContext{
			t:         t,
			num:       1500,
			fileSize:  cos.KiB,
			fixedSize: true,
			bck:       bck,
			// NOTE(review): resolves to the package-level proxyURL - the local
			// one below is declared later in this var block; confirm intended.
			proxyURL: proxyURL,
		}
		proxyURL    = tools.RandomProxyURL(t)
		baseParams  = tools.BaseAPIParams(proxyURL)
		origEntries = make(map[string]*cmn.LsoEnt, m.num)
	)

	m.initAndSaveState(true /*cleanup*/)
	origTargetCount := m.smap.CountActiveTs()
	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
	m.puts()

	// 1. Perform list-object and populate entries map.
	msg := &apc.LsoMsg{}
	msg.AddProps(apc.GetPropsChecksum, apc.GetPropsCopies, apc.GetPropsSize)
	lst, err := api.ListObjects(baseParams, bck, msg, api.ListArgs{})
	tassert.CheckFatal(t, err)
	tassert.Fatalf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d",
		m.num, len(lst.Entries))
	for _, en := range lst.Entries {
		origEntries[en.Name] = en
	}

	// 2. Shut down a random target.
	tsi, _ := m.smap.GetRandTarget()
	_, cmd, rebID, err := tools.ShutdownNode(t, baseParams, tsi)
	tassert.CheckFatal(t, err)

	// Restore target after test is over.
	t.Cleanup(func() {
		// restarting before the daemon fully terminates may result in "bind: address already in use"
		err = tools.WaitNodePubAddrNotInUse(tsi, time.Minute)
		tassert.CheckFatal(t, err)

		err = tools.RestoreNode(cmd, false, apc.Target)
		tassert.CheckError(t, err)

		// first, activate target, second, wait-for-cluster-state
		time.Sleep(time.Second)

		// retry once - the freshly restarted target may not be responsive yet
		_, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: tsi.ID()})
		if err != nil {
			time.Sleep(3 * time.Second)
			_, err = api.StopMaintenance(baseParams, &apc.ActValRmNode{DaemonID: tsi.ID()})
		}
		tassert.CheckError(t, err)
		_, err = tools.WaitForClusterState(proxyURL, "remove node from maintenance", m.smap.Version, 0, origTargetCount)
		tassert.CheckError(t, err)

		tools.WaitForRebalAndResil(t, baseParams)
	})

	// Wait for the automatically triggered rebalance; the IC may briefly
	// return 404 right after the shutdown, hence the retries.
	if origTargetCount > 1 {
		time.Sleep(time.Second)
		xargs := xact.ArgsMsg{ID: rebID, Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout}
		for range 3 {
			status, err := api.WaitForXactionIC(baseParams, &xargs)
			if err == nil {
				tlog.Logf("%v\n", status)
				break
			}
			// NOTE(review): herr may be nil for a non-HTTP error, in which case
			// the dereference below would panic the test - confirm and guard.
			herr := cmn.Err2HTTPErr(err)
			tassert.Errorf(t, herr.Status == http.StatusNotFound, "expecting not found, got %+v", herr)
			time.Sleep(time.Second)
		}
	}

	m.smap, err = tools.WaitForClusterState(proxyURL, "target shutdown", m.smap.Version, 0, origTargetCount-1, tsi.ID())
	tassert.CheckFatal(t, err)

	// 3. Check if we can list all the objects.
	if m.smap.CountActiveTs() == 0 {
		tlog.Logln("Shutdown single target - nothing to do")
		return
	}
	tlog.Logln("Listing objects")
	lst, err = api.ListObjects(baseParams, bck, msg, api.ListArgs{})
	tassert.CheckFatal(t, err)
	tassert.Errorf(t, len(lst.Entries) == m.num, "list-object should return %d objects - returned %d",
		m.num, len(lst.Entries))
	for _, en := range lst.Entries {
		origEntry, ok := origEntries[en.Name]
		tassert.Errorf(t, ok, "object %s missing in original entries", en.Name)
		if en.Version != origEntry.Version ||
			en.Flags != origEntry.Flags ||
			en.Copies != origEntry.Copies {
			t.Errorf("some fields of object %q, don't match: %#v v/s %#v ", en.Name, en, origEntry)
		}
	}
}