github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/globrun.go

     1  // Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package reb
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"net/http"
    12  	"path/filepath"
    13  	"runtime"
    14  	"sync"
    15  	ratomic "sync/atomic"
    16  	"time"
    17  
    18  	"github.com/NVIDIA/aistore/api/apc"
    19  	"github.com/NVIDIA/aistore/cmn"
    20  	"github.com/NVIDIA/aistore/cmn/atomic"
    21  	"github.com/NVIDIA/aistore/cmn/cos"
    22  	"github.com/NVIDIA/aistore/cmn/debug"
    23  	"github.com/NVIDIA/aistore/cmn/fname"
    24  	"github.com/NVIDIA/aistore/cmn/nlog"
    25  	"github.com/NVIDIA/aistore/cmn/prob"
    26  	"github.com/NVIDIA/aistore/core"
    27  	"github.com/NVIDIA/aistore/core/meta"
    28  	"github.com/NVIDIA/aistore/fs"
    29  	"github.com/NVIDIA/aistore/transport"
    30  	"github.com/NVIDIA/aistore/transport/bundle"
    31  	"github.com/NVIDIA/aistore/xact"
    32  	"github.com/NVIDIA/aistore/xact/xreg"
    33  	"github.com/NVIDIA/aistore/xact/xs"
    34  	jsoniter "github.com/json-iterator/go"
    35  	"golang.org/x/sync/errgroup"
    36  )
    37  
    38  const (
    39  	trname    = "reb"
    40  	trnamePsh = "pshreb" // broadcast stage notifications
    41  )
    42  
    43  // rebalance stage enum
    44  const (
    45  	rebStageInactive = iota
    46  	rebStageInit
    47  	rebStageTraverse
    48  	rebStageWaitAck
    49  	rebStageFin
    50  	rebStageFinStreams
    51  	rebStageDone
     52  	rebStageAbort // one of the targets aborts the rebalance (never set locally, only sent)
    53  )
    54  
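         // maxWackTargets is the initial capacity of the "awaiting ACK" targets list (see initRenew below)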
    55  const maxWackTargets = 4
    56  
    57  var stages = map[uint32]string{
    58  	rebStageInactive:   "<inactive>",
    59  	rebStageInit:       "<init>",
    60  	rebStageTraverse:   "<traverse>",
    61  	rebStageWaitAck:    "<wack>",
    62  	rebStageFin:        "<fin>",
    63  	rebStageFinStreams: "<fin-streams>",
    64  	rebStageDone:       "<done>",
    65  	rebStageAbort:      "<abort>",
    66  }
    67  
    68  const fmtpend = "%s: newer rebalance[g%d] pending - not running"
    69  
    70  type (
    71  	Reb struct {
     72  		smap      ratomic.Pointer[meta.Smap] // next Smap (the new one that becomes current after the rebalance)
    73  		xreb      ratomic.Pointer[xs.Rebalance]
    74  		dm        *bundle.DataMover
    75  		pushes    *bundle.Streams // broadcast notifications
    76  		filterGFN *prob.Filter
    77  		semaCh    *cos.Semaphore
    78  		ecClient  *http.Client
    79  		stages    *nodeStages
    80  		lomacks   [cos.MultiSyncMapCount]*lomAcks
    81  		awaiting  struct {
     82  			targets meta.Nodes // targets we are waiting for
     83  			ts      int64      // last time the list was recomputed
    84  			mtx     sync.Mutex
    85  		}
    86  		// (smap, xreb) + atomic state
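         		// rebID is the ID of the current (or most recent) rebalance; nxtID is the ID of a newer, pending one;
         		// inQueue counts in-flight sends via the data mover (incremented in doSend, decremented in the send-completion callback)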
    87  		rebID   atomic.Int64
    88  		nxtID   atomic.Int64
    89  		inQueue atomic.Int64
    90  		onAir   atomic.Int64
    91  		mu      sync.RWMutex
    92  		laterx  atomic.Bool
    93  	}
    94  	lomAcks struct {
    95  		mu *sync.Mutex
    96  		q  map[string]*core.LOM // on the wire, waiting for ACK
    97  	}
    98  	joggerBase struct {
    99  		m    *Reb
   100  		xreb *xs.Rebalance
   101  		wg   *sync.WaitGroup
   102  	}
   103  	rebJogger struct {
   104  		joggerBase
   105  		smap *meta.Smap
   106  		opts fs.WalkOpts
   107  		ver  int64
   108  	}
   109  	rebArgs struct {
   110  		smap   *meta.Smap
   111  		config *cmn.Config
   112  		apaths fs.MPI
   113  		id     int64
   114  		ecUsed bool
   115  	}
   116  )
   117  
   118  func New(config *cmn.Config) *Reb {
   119  	var (
   120  		reb = &Reb{
   121  			filterGFN: prob.NewDefaultFilter(),
   122  			stages:    newNodeStages(),
   123  		}
   124  		cargs = cmn.TransportArgs{Timeout: config.Client.Timeout.D()}
   125  	)
   126  	if config.Net.HTTP.UseHTTPS {
   127  		reb.ecClient = cmn.NewIntraClientTLS(cargs, config)
   128  	} else {
   129  		reb.ecClient = cmn.NewClient(cargs)
   130  	}
   131  	dmExtra := bundle.Extra{
   132  		RecvAck:     reb.recvAck,
   133  		Config:      config,
   134  		Compression: config.Rebalance.Compression,
   135  		Multiplier:  config.Rebalance.SbundleMult,
   136  	}
   137  	dm, err := bundle.NewDataMover(trname, reb.recvObj, cmn.OwtRebalance, dmExtra)
   138  	if err != nil {
   139  		cos.ExitLog(err)
   140  	}
   141  	debug.Assert(dm != nil)
   142  	reb.dm = dm
   143  
   144  	// serialize one global rebalance at a time
   145  	reb.semaCh = cos.NewSemaphore(1)
   146  	return reb
   147  }
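
         // Usage sketch (illustration only, not part of the original source): a target typically creates
         // a single Reb at startup and reuses it for every subsequent cluster-wide rebalance, e.g.:
         //
         //	reb := New(cmn.GCO.Get())
         //	go reb.RunRebalance(newSmap, newRebID, notif)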
   148  
   149  func (reb *Reb) regRecv() {
   150  	if err := reb.dm.RegRecv(); err != nil {
   151  		cos.ExitLog(err)
   152  	}
   153  	if err := transport.Handle(trnamePsh, reb.recvStageNtfn /*RecvObj*/); err != nil {
   154  		cos.ExitLog(err)
   155  	}
   156  }
   157  
   158  func (reb *Reb) unregRecv() {
   159  	reb.dm.UnregRecv()
   160  	err := transport.Unhandle(trnamePsh)
   161  	debug.AssertNoErr(err)
   162  }
   163  
    164  // run sequence: non-EC and EC global
    165  //
    166  // main method: serialized to run one rebalance at a time; proceeds through the controlled, enumerated stages
    167  // A note on stage management:
    168  //  1. Non-EC and EC rebalances run in parallel
    169  //  2. Execution starts after the `Reb` sets the current stage to rebStageTraverse
    170  //  3. Only the EC rebalance changes the current stage
    171  //  4. The global (non-EC) rebalance performs checks such as `stage > rebStageTraverse` or
    172  //     `stage < rebStageWaitAck`. Since all EC stages fall between
    173  //     `Traverse` and `WaitAck`, the non-EC rebalance does not "notice" the stage changes.
   174  func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact) {
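         	// ignore if not newer than the already pending rebalance ID; otherwise re-check and record it under the lock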
   175  	if reb.nxtID.Load() >= id {
   176  		return
   177  	}
   178  	reb.mu.Lock()
   179  	if reb.nxtID.Load() >= id {
   180  		reb.mu.Unlock()
   181  		return
   182  	}
   183  	debug.Assert(id > reb.rebID.Load())
   184  	reb.nxtID.Store(id)
   185  	reb.mu.Unlock()
   186  
   187  	logHdr := reb.logHdr(id, smap, true /*initializing*/)
   188  	nlog.Infoln(logHdr + ": initializing")
   189  
   190  	bmd := core.T.Bowner().Get()
   191  	rargs := &rebArgs{id: id, smap: smap, config: cmn.GCO.Get(), ecUsed: bmd.IsECUsed()}
   192  	if !reb.serialize(rargs, logHdr) {
   193  		return
   194  	}
   195  
   196  	reb.regRecv()
   197  
   198  	haveStreams := smap.HasActiveTs(core.T.SID())
   199  	if bmd.IsEmpty() {
   200  		haveStreams = false
   201  	}
   202  	if !reb.initRenew(rargs, notif, logHdr, haveStreams) {
   203  		reb.unregRecv()
   204  		reb.semaCh.Release()
   205  		return
   206  	}
   207  	if !haveStreams {
   208  		// cleanup and leave
   209  		nlog.Infof("%s: nothing to do: %s, %s", logHdr, smap.StringEx(), bmd.StringEx())
   210  		reb.stages.stage.Store(rebStageDone)
   211  		reb.unregRecv()
   212  		reb.semaCh.Release()
   213  		fs.RemoveMarker(fname.RebalanceMarker)
   214  		fs.RemoveMarker(fname.NodeRestartedPrev)
   215  		reb.xctn().Finish()
   216  		return
   217  	}
   218  
   219  	// abort all running `dtor.AbortRebRes` xactions (download, dsort, etl)
   220  	xreg.AbortByNewReb(errors.New("reason: starting " + reb.xctn().Name()))
   221  
   222  	// At this point, only one rebalance is running
   223  
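         	// GFN ("get from neighbor"): while the rebalance is running, reads for objects that have not arrived yet
         	// can still be satisfied from their previous owners - hence onGFN/offGFN bracketing the run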
   224  	onGFN()
   225  
   226  	errCnt := 0
   227  	err := reb.run(rargs)
   228  	if err == nil {
   229  		errCnt = reb.rebWaitAck(rargs)
   230  	} else {
   231  		nlog.Warningln(err)
   232  	}
   233  	reb.changeStage(rebStageFin)
   234  
   235  	for errCnt != 0 && !reb.xctn().IsAborted() {
   236  		errCnt = bcast(rargs, reb.waitFinExtended)
   237  	}
   238  
   239  	reb.fini(rargs, logHdr, err)
   240  
   241  	offGFN()
   242  }
   243  
    244  // To optimize goroutine creation:
    245  //  1. When no bucket is erasure coded, only the regular (non-EC) rebalance
    246  //     runs, in the calling goroutine (no extra goroutine is created).
    247  //  2. Otherwise, the non-EC and EC rebalances start in parallel,
    248  //     and the method waits until both finish.
   249  func (reb *Reb) run(rargs *rebArgs) error {
   250  	// 6. Capture stats, start mpath joggers
   251  	reb.stages.stage.Store(rebStageTraverse)
   252  
   253  	// No EC-enabled buckets - run only regular rebalance
   254  	if !rargs.ecUsed {
   255  		nlog.Infof("starting g%d", rargs.id)
   256  		return reb.runNoEC(rargs)
   257  	}
   258  
   259  	// In all other cases run both rebalances simultaneously
   260  	group := &errgroup.Group{}
   261  	group.Go(func() error {
   262  		nlog.Infof("starting non-EC g%d", rargs.id)
   263  		return reb.runNoEC(rargs)
   264  	})
   265  	group.Go(func() error {
   266  		nlog.Infof("starting EC g%d", rargs.id)
   267  		return reb.runEC(rargs)
   268  	})
   269  	return group.Wait()
   270  }
   271  
   272  func (reb *Reb) serialize(rargs *rebArgs, logHdr string) bool {
   273  	// 1. check whether other targets are up and running
   274  	if errCnt := bcast(rargs, reb.pingTarget); errCnt > 0 {
   275  		return false
   276  	}
   277  	if rargs.smap.Version == 0 {
   278  		rargs.smap = core.T.Sowner().Get()
   279  	}
   280  	// 2. serialize global rebalance and start new xaction -
   281  	//    but only if the one that handles the current version is _not_ already in progress
   282  	if newerRMD, alreadyRunning := reb.acquire(rargs, logHdr); newerRMD || alreadyRunning {
   283  		return false
   284  	}
   285  	if rargs.smap.Version == 0 {
   286  		rargs.smap = core.T.Sowner().Get()
   287  	}
   288  	rargs.apaths = fs.GetAvail()
   289  	return true
   290  }
   291  
   292  func (reb *Reb) acquire(rargs *rebArgs, logHdr string) (newerRMD, alreadyRunning bool) {
   293  	var (
   294  		total    time.Duration
   295  		sleep    = rargs.config.Timeout.CplaneOperation.D()
   296  		maxTotal = max(20*sleep, 10*time.Second) // time to abort prev. streams
   297  		maxwt    = max(rargs.config.Rebalance.DestRetryTime.D(), 2*maxTotal)
   298  		errcnt   int
   299  		acquired bool
   300  	)
   301  	for {
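         		// non-blocking attempt to take the single-rebalance semaphore; if it is busy, yield and retry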
   302  		select {
   303  		case <-reb.semaCh.TryAcquire():
   304  			acquired = true
   305  		default:
   306  			runtime.Gosched()
   307  		}
   308  		if id := reb.nxtID.Load(); id > rargs.id {
   309  			nlog.Warningf(fmtpend, logHdr, id)
   310  			newerRMD = true
   311  			if acquired {
   312  				reb.semaCh.Release()
   313  			}
   314  			return
   315  		}
   316  		if reb.rebID.Load() == rargs.id {
   317  			if acquired {
   318  				reb.semaCh.Release()
   319  			}
   320  			nlog.Warningf("%s: rebalance[g%d] is already running", logHdr, rargs.id)
   321  			alreadyRunning = true
   322  			return
   323  		}
   324  
   325  		if acquired { // ok
   326  			if errcnt > 1 {
   327  				nlog.Infof("%s: resolved (%d)", logHdr, errcnt)
   328  			}
   329  			return
   330  		}
   331  
   332  		// try to preempt
   333  		err := reb._preempt(rargs, logHdr, total, maxTotal, errcnt)
   334  		if err != nil {
   335  			if total > maxwt {
   336  				cos.ExitLog(err)
   337  			}
   338  			errcnt++
   339  		}
   340  		time.Sleep(sleep)
   341  		total += sleep
   342  	}
   343  }
   344  
   345  func (reb *Reb) _preempt(rargs *rebArgs, logHdr string, total, maxTotal time.Duration, errcnt int) (err error) {
   346  	entry := xreg.GetRunning(xreg.Flt{Kind: apc.ActRebalance})
   347  	if entry == nil {
   348  		var (
   349  			rebID   = reb.RebID()
   350  			rsmap   = reb.smap.Load()
   351  			rlogHdr = reb.logHdr(rebID, rsmap, true)
   352  			xreb    = reb.xctn()
   353  			s       string
   354  		)
   355  		if xreb != nil {
   356  			s = ", " + xreb.String()
   357  		}
   358  		err = fmt.Errorf("%s: acquire/release asymmetry vs %s%s", logHdr, rlogHdr, s)
   359  		if errcnt%2 == 1 {
   360  			nlog.Errorln(err)
   361  		}
   362  		return
   363  	}
   364  	otherXreb := entry.Get().(*xs.Rebalance) // running or previous
   365  	otherRebID := otherXreb.RebID()
   366  	if otherRebID >= rargs.id {
   367  		return
   368  	}
   369  	if !otherXreb.IsAborted() {
   370  		otherXreb.Abort(cmn.ErrXactRenewAbort)
   371  		nlog.Warningf("%s: aborting older %s", logHdr, otherXreb)
   372  		return
   373  	}
   374  	if total > maxTotal {
   375  		err = fmt.Errorf("%s: preempting older %s takes too much time", logHdr, otherXreb)
   376  		nlog.Errorln(err)
   377  		if xreb := reb.xctn(); xreb != nil && xreb.ID() == otherXreb.ID() {
   378  			debug.Assert(reb.dm.GetXact().ID() == otherXreb.ID())
   379  			nlog.Warningf("%s: aborting older streams...", logHdr)
   380  			reb.abortStreams()
   381  		}
   382  	}
   383  	return
   384  }
   385  
   386  func (reb *Reb) initRenew(rargs *rebArgs, notif *xact.NotifXact, logHdr string, haveStreams bool) bool {
   387  	if id := reb.nxtID.Load(); id > rargs.id {
   388  		nlog.Warningf(fmtpend, logHdr, id)
   389  		return false
   390  	}
   391  	rns := xreg.RenewRebalance(rargs.id)
   392  	debug.AssertNoErr(rns.Err)
   393  	if rns.IsRunning() {
   394  		return false
   395  	}
   396  	xctn := rns.Entry.Get()
   397  
   398  	notif.Xact = xctn
   399  	xctn.AddNotif(notif)
   400  
   401  	reb.mu.Lock()
   402  	if id := reb.nxtID.Load(); id > rargs.id {
   403  		reb.mu.Unlock()
   404  		nlog.Warningf(fmtpend, logHdr, id)
   405  		return false
   406  	}
   407  	reb.stages.stage.Store(rebStageInit)
   408  	xreb := xctn.(*xs.Rebalance)
   409  	reb.setXact(xreb)
   410  	reb.rebID.Store(rargs.id)
   411  
   412  	// check Smap _prior_ to opening streams
   413  	smap := core.T.Sowner().Get()
   414  	if smap.Version != rargs.smap.Version {
   415  		debug.Assert(smap.Version > rargs.smap.Version)
   416  		nlog.Errorf("Warning %s: %s post-init version change %s => %s", core.T, xreb, rargs.smap, smap)
    417  		// TODO: handle this unlikely corner case, keeping in mind that not every Smap change warrants a different rebalance
   418  	}
   419  
   420  	// 3. init streams and data structures
   421  	if haveStreams {
   422  		reb.beginStreams(rargs.config)
   423  	}
   424  
   425  	if reb.awaiting.targets == nil {
   426  		reb.awaiting.targets = make(meta.Nodes, 0, maxWackTargets)
   427  	} else {
   428  		reb.awaiting.targets = reb.awaiting.targets[:0]
   429  	}
   430  	acks := reb.lomAcks()
   431  	for i := range len(acks) { // init lom acks
   432  		acks[i] = &lomAcks{mu: &sync.Mutex{}, q: make(map[string]*core.LOM, 64)}
   433  	}
   434  
   435  	// 4. create persistent mark
   436  	if fatalErr, writeErr := fs.PersistMarker(fname.RebalanceMarker); fatalErr != nil || writeErr != nil {
   437  		err := writeErr
   438  		if fatalErr != nil {
   439  			err = fatalErr
   440  		}
   441  		reb.endStreams(err)
   442  		xctn.Abort(err)
   443  		reb.mu.Unlock()
   444  		nlog.Errorf("FATAL: %v, WRITE: %v", fatalErr, writeErr)
   445  		return false
   446  	}
   447  
   448  	// 5. ready - can receive objects
   449  	reb.smap.Store(rargs.smap)
   450  	reb.stages.cleanup()
   451  
   452  	reb.mu.Unlock()
   453  	nlog.Infof("%s: running %s", reb.logHdr(rargs.id, rargs.smap), reb.xctn())
   454  	return true
   455  }
   456  
   457  func (reb *Reb) beginStreams(config *cmn.Config) {
   458  	debug.Assert(reb.stages.stage.Load() == rebStageInit)
   459  
   460  	xreb := reb.xctn()
   461  	reb.dm.SetXact(xreb)
   462  	reb.dm.Open()
   463  	pushArgs := bundle.Args{
   464  		Net:        reb.dm.NetC(),
   465  		Trname:     trnamePsh,
   466  		Multiplier: config.Rebalance.SbundleMult,
   467  		Extra:      &transport.Extra{SenderID: xreb.ID(), Config: config},
   468  	}
   469  	reb.pushes = bundle.New(transport.NewIntraDataClient(), pushArgs)
   470  
   471  	reb.laterx.Store(false)
   472  	reb.inQueue.Store(0)
   473  }
   474  
   475  func (reb *Reb) abortStreams() {
   476  	reb.dm.Abort()
   477  	reb.pushes.Abort()
   478  }
   479  
   480  func (reb *Reb) endStreams(err error) {
   481  	if reb.stages.stage.CAS(rebStageFin, rebStageFinStreams) {
   482  		reb.dm.Close(err)
   483  		reb.pushes.Close(true)
   484  	}
   485  }
   486  
   487  // when at least one bucket has EC enabled
   488  func (reb *Reb) runEC(rargs *rebArgs) error {
   489  	errCnt := bcast(rargs, reb.rxReady) // ignore timeout
   490  	xreb := reb.xctn()
   491  	if err := xreb.AbortErr(); err != nil {
   492  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   493  		nlog.Infoln(logHdr, "abort ec rx-ready", err, "num-fail", errCnt)
   494  		return err
   495  	}
   496  	if errCnt > 0 {
   497  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   498  		nlog.Errorln(logHdr, "ec rx-ready num-fail", errCnt) // unlikely
   499  	}
   500  
   501  	reb.runECjoggers()
   502  
   503  	if err := xreb.AbortErr(); err != nil {
   504  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   505  		nlog.Infoln(logHdr, "abort ec-joggers", err)
   506  		return err
   507  	}
   508  	nlog.Infof("[%s] RebalanceEC done", core.T.SID())
   509  	return nil
   510  }
   511  
   512  // when not a single bucket has EC enabled
   513  func (reb *Reb) runNoEC(rargs *rebArgs) error {
   514  	errCnt := bcast(rargs, reb.rxReady) // ignore timeout
   515  	xreb := reb.xctn()
   516  	if err := xreb.AbortErr(); err != nil {
   517  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   518  		nlog.Infoln(logHdr, "abort rx-ready", err, "num-fail", errCnt)
   519  		return err
   520  	}
   521  	if errCnt > 0 {
   522  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   523  		nlog.Errorln(logHdr, "rx-ready num-fail", errCnt) // unlikely
   524  	}
   525  
   526  	wg := &sync.WaitGroup{}
   527  	ver := rargs.smap.Version
   528  	for _, mi := range rargs.apaths {
   529  		rl := &rebJogger{
   530  			joggerBase: joggerBase{m: reb, xreb: reb.xctn(), wg: wg},
   531  			smap:       rargs.smap, ver: ver,
   532  		}
   533  		wg.Add(1)
   534  		go rl.jog(mi)
   535  	}
   536  	wg.Wait()
   537  
   538  	if err := xreb.AbortErr(); err != nil {
   539  		logHdr := reb.logHdr(rargs.id, rargs.smap)
   540  		nlog.Infoln(logHdr, "abort joggers", err)
   541  		return err
   542  	}
   543  	if cmn.Rom.FastV(4, cos.SmoduleReb) {
   544  		nlog.Infof("finished rebalance walk (g%d)", rargs.id)
   545  	}
   546  	return nil
   547  }
   548  
   549  func (reb *Reb) rebWaitAck(rargs *rebArgs) (errCnt int) {
   550  	var (
   551  		cnt    int
   552  		logHdr = reb.logHdr(rargs.id, rargs.smap)
   553  		sleep  = rargs.config.Timeout.CplaneOperation.D()
   554  		maxwt  = rargs.config.Rebalance.DestRetryTime.D()
   555  		xreb   = reb.xctn()
   556  		smap   = rargs.smap
   557  	)
   558  	maxwt += time.Duration(int64(time.Minute) * int64(rargs.smap.CountTargets()/10))
   559  	maxwt = min(maxwt, rargs.config.Rebalance.DestRetryTime.D()*2)
   560  	reb.changeStage(rebStageWaitAck)
   561  
   562  	for {
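         		// outer loop: wait for pending ACKs, poll targets for stage synchronization, retransmit leftovers - repeat until done or aborted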
   563  		curwt := time.Duration(0)
   564  		// poll for no more than maxwt while keeping track of the cumulative polling time via curwt
   565  		// (here and elsewhere)
   566  		for curwt < maxwt {
   567  			cnt = 0
   568  			var logged bool
   569  			for _, lomack := range reb.lomAcks() {
   570  				lomack.mu.Lock()
   571  				if l := len(lomack.q); l > 0 {
   572  					cnt += l
   573  					if !logged {
   574  						for _, lom := range lomack.q {
   575  							tsi, err := smap.HrwHash2T(lom.Digest())
   576  							if err == nil {
   577  								nlog.Infof("waiting for %s ACK from %s", lom, tsi.StringEx())
   578  								logged = true
   579  								break
   580  							}
   581  						}
   582  					}
   583  				}
   584  				lomack.mu.Unlock()
   585  				if err := xreb.AbortErr(); err != nil {
   586  					nlog.Infof("%s: abort wait-ack (%v)", logHdr, err)
   587  					return
   588  				}
   589  			}
   590  			if cnt == 0 {
   591  				nlog.Infof("%s: received all ACKs", logHdr)
   592  				break
   593  			}
   594  			nlog.Warningf("%s: waiting for %d ACKs", logHdr, cnt)
   595  			if err := xreb.AbortedAfter(sleep); err != nil {
   596  				nlog.Infof("%s: abort wait-ack (%v)", logHdr, err)
   597  				return
   598  			}
   599  
   600  			curwt += sleep
   601  		}
   602  		if cnt > 0 {
   603  			nlog.Warningf("%s: timed out waiting for %d ACK%s", logHdr, cnt, cos.Plural(cnt))
   604  		}
   605  		if xreb.IsAborted() {
   606  			return
   607  		}
   608  
   609  		// NOTE: requires locally migrated objects *not* to be removed at the src
   610  		aPaths, _ := fs.Get()
   611  		if len(aPaths) > len(rargs.apaths) {
   612  			nlog.Warningf("%s: mountpath changes detected (%d, %d)", logHdr, len(aPaths), len(rargs.apaths))
   613  		}
   614  
   615  		// 8. synchronize
   616  		nlog.Infof("%s: poll targets for: stage=(%s or %s***)", logHdr, stages[rebStageFin], stages[rebStageWaitAck])
   617  		errCnt = bcast(rargs, reb.waitFinExtended)
   618  		if xreb.IsAborted() {
   619  			return
   620  		}
   621  
   622  		// 9. retransmit if needed
   623  		cnt = reb.retransmit(rargs, xreb)
   624  		if cnt == 0 || reb.xctn().IsAborted() {
   625  			break
   626  		}
   627  		nlog.Warningf("%s: retransmitted %d, more wack...", logHdr, cnt)
   628  	}
   629  
   630  	return
   631  }
   632  
   633  func (reb *Reb) retransmit(rargs *rebArgs, xreb *xs.Rebalance) (cnt int) {
   634  	if reb._aborted(rargs) {
   635  		return
   636  	}
   637  	var (
   638  		rj = &rebJogger{joggerBase: joggerBase{
   639  			m: reb, xreb: reb.xctn(),
   640  			wg: &sync.WaitGroup{},
   641  		}, smap: rargs.smap}
   642  		loghdr = reb.logHdr(rargs.id, rargs.smap)
   643  	)
   644  	for _, lomAck := range reb.lomAcks() {
   645  		lomAck.mu.Lock()
   646  		for uname, lom := range lomAck.q {
   647  			if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
   648  				if cos.IsNotExist(err, 0) {
   649  					if cmn.Rom.FastV(5, cos.SmoduleReb) {
   650  						nlog.Infoln(loghdr, lom.Cname(), "not found")
   651  					}
   652  				} else {
   653  					err = cmn.NewErrFailedTo(core.T, "load", lom.Cname(), err)
   654  					rj.xreb.AddErr(err)
   655  				}
   656  				delete(lomAck.q, uname)
   657  				continue
   658  			}
   659  			tsi, _ := rargs.smap.HrwHash2T(lom.Digest())
   660  			if core.T.HeadObjT2T(lom, tsi) {
   661  				if cmn.Rom.FastV(4, cos.SmoduleReb) {
   662  					nlog.Infof("%s: HEAD ok %s at %s", loghdr, lom, tsi.StringEx())
   663  				}
   664  				delete(lomAck.q, uname)
   665  				continue
   666  			}
   667  			// retransmit
   668  			roc, err := _getReader(lom)
   669  			if err == nil {
   670  				err = rj.doSend(lom, tsi, roc)
   671  			}
   672  			if err == nil {
   673  				if cmn.Rom.FastV(4, cos.SmoduleReb) {
   674  					nlog.Infof("%s: retransmit %s => %s", loghdr, lom, tsi.StringEx())
   675  				}
   676  				cnt++
   677  			} else {
   678  				if cmn.IsErrStreamTerminated(err) {
   679  					xreb.Abort(err)
   680  					nlog.Errorf("%s: stream term-ed (%v)", loghdr, err)
   681  				} else {
   682  					err = fmt.Errorf("%s: failed to retransmit %s => %s: %w", loghdr, lom, tsi.StringEx(), err)
   683  					rj.xreb.AddErr(err)
   684  				}
   685  			}
   686  			if reb._aborted(rargs) {
   687  				lomAck.mu.Unlock()
   688  				return 0
   689  			}
   690  		}
   691  		lomAck.mu.Unlock()
   692  		if reb._aborted(rargs) {
   693  			return 0
   694  		}
   695  	}
   696  	return
   697  }
   698  
   699  func (reb *Reb) _aborted(rargs *rebArgs) (yes bool) {
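         	// the run counts as aborted if the xaction was aborted or the cluster map changed since the start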
   700  	yes = reb.xctn().IsAborted()
   701  	yes = yes || (rargs.smap.Version != core.T.Sowner().Get().Version)
   702  	return
   703  }
   704  
   705  func (reb *Reb) fini(rargs *rebArgs, logHdr string, err error) {
   706  	var stats core.Stats
   707  	if cmn.Rom.FastV(4, cos.SmoduleReb) {
   708  		nlog.Infof("finishing rebalance (reb_args: %s)", reb.logHdr(rargs.id, rargs.smap))
   709  	}
   710  	// prior to closing the streams
   711  	if q := reb.quiesce(rargs, rargs.config.Transport.QuiesceTime.D(), reb.nodesQuiescent); q != core.QuiAborted {
   712  		if errM := fs.RemoveMarker(fname.RebalanceMarker); errM == nil {
   713  			nlog.Infof("%s: %s removed marker ok", core.T, reb.xctn())
   714  		}
   715  		_ = fs.RemoveMarker(fname.NodeRestartedPrev)
   716  	}
   717  	reb.endStreams(err)
   718  	reb.filterGFN.Reset()
   719  	xreb := reb.xctn()
   720  	xreb.ToStats(&stats)
   721  	if stats.Objs > 0 || stats.OutObjs > 0 || stats.InObjs > 0 {
   722  		s, e := jsoniter.MarshalIndent(&stats, "", " ")
   723  		debug.AssertNoErr(e)
   724  		nlog.Infoln(string(s))
   725  	}
   726  	reb.stages.stage.Store(rebStageDone)
   727  	reb.stages.cleanup()
   728  
   729  	reb.unregRecv()
   730  	reb.semaCh.Release()
   731  	if !xreb.Finished() {
   732  		xreb.Finish()
   733  	}
   734  	nlog.Infof("%s: done (%s)", logHdr, xreb)
   735  }
   736  
   737  //////////////////////////////
   738  // rebJogger: global non-EC //
   739  //////////////////////////////
   740  
   741  func (rj *rebJogger) jog(mi *fs.Mountpath) {
    742  	// the jogger runs in a separate goroutine; defer the `Done` call so it
    743  	// executes even if the jogger panics, to avoid hanging the wait group
   744  	defer rj.wg.Done()
   745  	{
   746  		rj.opts.Mi = mi
   747  		rj.opts.CTs = []string{fs.ObjectType}
   748  		rj.opts.Callback = rj.visitObj
   749  		rj.opts.Sorted = false
   750  	}
   751  	bmd := core.T.Bowner().Get()
   752  	bmd.Range(nil, nil, rj.walkBck)
   753  }
   754  
   755  func (rj *rebJogger) walkBck(bck *meta.Bck) bool {
   756  	rj.opts.Bck.Copy(bck.Bucket())
   757  	err := fs.Walk(&rj.opts)
   758  	if err == nil {
   759  		return rj.xreb.IsAborted()
   760  	}
   761  	if rj.xreb.IsAborted() {
   762  		nlog.Infoln(rj.xreb.Name(), "aborting traversal")
   763  	} else {
   764  		nlog.Errorln(core.T.String(), rj.xreb.Name(), "failed to traverse", err)
   765  	}
   766  	return true
   767  }
   768  
   769  // send completion
   770  func (rj *rebJogger) objSentCallback(hdr *transport.ObjHdr, _ io.ReadCloser, arg any, err error) {
   771  	rj.m.inQueue.Dec()
   772  	if err == nil {
   773  		rj.xreb.OutObjsAdd(1, hdr.ObjAttrs.Size) // NOTE: double-counts retransmissions
   774  		return
   775  	}
   776  	// log err
   777  	if cmn.Rom.FastV(4, cos.SmoduleReb) || !cos.IsRetriableConnErr(err) {
   778  		if bundle.IsErrDestinationMissing(err) {
   779  			nlog.Errorf("%s: %v, %s", rj.xreb.Name(), err, rj.smap)
   780  		} else {
   781  			lom, ok := arg.(*core.LOM)
   782  			debug.Assert(ok)
   783  			nlog.Errorf("%s: %s failed to send %s: %v", core.T, rj.xreb.Name(), lom, err)
   784  		}
   785  	}
   786  }
   787  
   788  func (rj *rebJogger) visitObj(fqn string, de fs.DirEntry) error {
   789  	if err := rj.xreb.AbortErr(); err != nil {
   790  		nlog.Infoln(rj.xreb.Name(), "rj-walk-visit aborted", err)
   791  		return err
   792  	}
   793  	if de.IsDir() {
   794  		return nil
   795  	}
   796  	lom := core.AllocLOM(fqn)
   797  	err := rj._lwalk(lom, fqn)
   798  	if err != nil {
   799  		core.FreeLOM(lom)
   800  		if err == cmn.ErrSkip {
   801  			err = nil
   802  		}
   803  	}
   804  	return err
   805  }
   806  
   807  func (rj *rebJogger) _lwalk(lom *core.LOM, fqn string) error {
   808  	if err := lom.InitFQN(fqn, nil); err != nil {
   809  		if cmn.IsErrBucketLevel(err) {
   810  			return err
   811  		}
   812  		return cmn.ErrSkip
   813  	}
    814  	// skip buckets with EC enabled - leave the job to the EC rebalance
   815  	if lom.ECEnabled() {
   816  		return filepath.SkipDir
   817  	}
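         	// HRW: compute the object's destination target under the new Smap; nothing to send if it stays local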
   818  	tsi, err := rj.smap.HrwHash2T(lom.Digest())
   819  	if err != nil {
   820  		return err
   821  	}
   822  	if tsi.ID() == core.T.SID() {
   823  		return cmn.ErrSkip
   824  	}
   825  
    826  	// skip objects that were already sent via GFN (the filter is probabilistic,
    827  	// so rare false positives are still possible)
   828  	uname := cos.UnsafeB(lom.Uname())
   829  	if rj.m.filterGFN.Lookup(uname) {
   830  		rj.m.filterGFN.Delete(uname)
   831  		return cmn.ErrSkip
   832  	}
   833  	// prepare to send: rlock, load, new roc
   834  	var roc cos.ReadOpenCloser
   835  	if roc, err = _getReader(lom); err != nil {
   836  		return err
   837  	}
   838  
   839  	// transmit (unlock via transport completion => roc.Close)
   840  	rj.m.addLomAck(lom)
   841  	if err := rj.doSend(lom, tsi, roc); err != nil {
   842  		rj.m.delLomAck(lom, 0, false /*free LOM*/)
   843  		return err
   844  	}
   845  
   846  	return nil
   847  }
   848  
   849  // takes rlock and keeps it _iff_ successful
   850  func _getReader(lom *core.LOM) (roc cos.ReadOpenCloser, err error) {
   851  	lom.Lock(false)
   852  	if err = lom.Load(false /*cache it*/, true /*locked*/); err != nil {
   853  		lom.Unlock(false)
   854  		return
   855  	}
   856  	if lom.IsCopy() {
   857  		lom.Unlock(false)
   858  		err = cmn.ErrSkip
   859  		return
   860  	}
   861  	if lom.Checksum() == nil {
   862  		if _, err = lom.ComputeSetCksum(); err != nil {
   863  			lom.Unlock(false)
   864  			return
   865  		}
   866  	}
   867  	debug.Assert(lom.Checksum() != nil, lom.String())
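         	// the deferred ROC is expected to release the read lock on Close, i.e., when the send completes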
   868  	return lom.NewDeferROC()
   869  }
   870  
   871  func (rj *rebJogger) doSend(lom *core.LOM, tsi *meta.Snode, roc cos.ReadOpenCloser) error {
   872  	var (
   873  		ack    = regularAck{rebID: rj.m.RebID(), daemonID: core.T.SID()}
   874  		o      = transport.AllocSend()
   875  		opaque = ack.NewPack()
   876  	)
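         	// the (rebID, daemonID) pair packed into Opaque lets the destination ACK this exact object back to the sender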
   877  	o.Hdr.Bck.Copy(lom.Bucket())
   878  	o.Hdr.ObjName = lom.ObjName
   879  	o.Hdr.Opaque = opaque
   880  	o.Hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
   881  	o.Callback, o.CmplArg = rj.objSentCallback, lom
   882  	rj.m.inQueue.Inc()
   883  	return rj.m.dm.Send(o, roc, tsi)
   884  }