github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/rebmeta.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"os"
	"path/filepath"
	"sync"
	ratomic "sync/atomic"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/fname"
	"github.com/NVIDIA/aistore/cmn/jsp"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/nl"
	"github.com/NVIDIA/aistore/xact"
)

// Rebalance metadata is distributed to trigger global (a.k.a. cluster) rebalance.
// This (distribution) happens:
//   - when a new target node joins the cluster;
//   - at startup, when the cluster detects an unfinished (aborted) rebalance;
//   - when we remove a target while some bucket(s) are erasure-coded
//     (we must then redistribute the slices);
//   - upon bucket rename:
//     1. the bucket is renamed (and the paths of its objects change);
//     2. rebalance must be started to redistribute the objects to the targets
//        according to HRW;
//   - when requested by the user (`ais start rebalance` or via REST API);
//   - upon target node power-cycle (and more).

type (
	// rebMD is revs (see metasync) distributed by the primary proxy to the
	// targets when some kind of rebalance is required.
	rebMD struct {
		meta.RMD
	}

	// rmdOwner keeps information about rebalances. Currently, it stores the
	// Version of the latest rebalance.
	rmdOwner struct {
		cluID string
		fpath string
		sync.Mutex
		rmd         ratomic.Pointer[rebMD]
		interrupted atomic.Bool // when joining target reports interrupted rebalance
		starting    atomic.Bool // when starting up
	}

	rmdModifier struct {
		pre   func(ctx *rmdModifier, clone *rebMD)
		final func(ctx *rmdModifier, clone *rebMD)

		prev  *rebMD // pre-modification rmd
		cur   *rebMD // CoW clone
		rebID string // cluster-wide rebalance ID, "g[uuid]" in the logs

		cluID   string // cluster ID (== smap.UUID) - never changes
		p       *proxy
		smapCtx *smapModifier
		wait    bool
	}
)
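// NOTE: illustrative sketch, not part of the original file. It shows one way a
// caller might drive the copy-on-write flow declared above: `pre` mutates the
// clone (here, rmdInc bumps the version) and `final` distributes the result
// (here, rmdSync metasyncs it) - see both callbacks further below. The helper
// name is hypothetical, and the sketch assumes the caller has already prepared
// the *smapModifier (including its msg).
func startRebalanceSketch(p *proxy, smapCtx *smapModifier, wait bool) (rebID string, err error) {
	ctx := &rmdModifier{
		pre:     rmdInc,  // bump RMD version on the clone
		final:   rmdSync, // metasync the new RMD to the targets
		p:       p,
		smapCtx: smapCtx,
		wait:    wait,
	}
	if _, err = p.owner.rmd.modify(ctx); err != nil {
		return "", err
	}
	return ctx.rebID, nil // new cluster-wide rebalance ID ("g[uuid]" in the logs)
}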
// interface guard
var _ revs = (*rebMD)(nil)

// as revs
func (*rebMD) tag() string       { return revsRMDTag }
func (r *rebMD) version() int64  { return r.Version }
func (r *rebMD) marshal() []byte { return cos.MustMarshal(r) }
func (*rebMD) jit(p *proxy) revs { return p.owner.rmd.get() }
func (*rebMD) sgl() *memsys.SGL  { return nil }

func (r *rebMD) inc() { r.Version++ }

func (r *rebMD) clone() *rebMD {
	dst := &rebMD{}
	cos.CopyStruct(dst, r)
	return dst
}

func (r *rebMD) String() string {
	if r == nil {
		return "RMD <nil>"
	}
	if len(r.TargetIDs) == 0 && r.Resilver == "" {
		return fmt.Sprintf("RMD v%d[%s]", r.Version, r.CluID)
	}
	var s string
	if r.Resilver != "" {
		s = ", " + r.Resilver
	}
	return fmt.Sprintf("RMD v%d[%s, %v%s]", r.Version, r.CluID, r.TargetIDs, s)
}

//////////////
// rmdOwner //
//////////////

func newRMDOwner(config *cmn.Config) *rmdOwner {
	rmdo := &rmdOwner{fpath: filepath.Join(config.ConfigDir, fname.Rmd)}
	rmdo.put(&rebMD{})
	return rmdo
}

func (r *rmdOwner) persist(rmd *rebMD) error {
	return jsp.SaveMeta(r.fpath, rmd, nil /*wto*/)
}

func (r *rmdOwner) load() {
	rmd := &rebMD{}
	_, err := jsp.LoadMeta(r.fpath, rmd)
	if err == nil {
		r.put(rmd)
		return
	}
	if !os.IsNotExist(err) {
		nlog.Errorln("failed to load RMD:", err)
		nlog.Infoln("Warning: make sure to properly decommission previously deployed clusters, proceeding anyway...")
	}
}

func (r *rmdOwner) put(rmd *rebMD) { r.rmd.Store(rmd) }
func (r *rmdOwner) get() *rebMD    { return r.rmd.Load() }

func (r *rmdOwner) modify(ctx *rmdModifier) (clone *rebMD, err error) {
	r.Lock()
	clone, err = r.do(ctx)
	r.Unlock()

	if err == nil && ctx.final != nil {
		ctx.final(ctx, clone)
	}
	return
}

const rmdFromAnother = `
%s: RMD v%d (cluster ID %q) belongs to a different cluster %q

-----------------
To troubleshoot:
1. first, make sure you are not trying to run two different clusters that utilize (or include) the same machine
2. remove possibly misplaced RMD from the %s (located at %s)
3. restart %s
-----------------`

func (r *rmdOwner) newClusterIntegrityErr(node, otherCID, haveCID string, version int64) (err error) {
	return fmt.Errorf(rmdFromAnother, node, version, haveCID, otherCID, node, r.fpath, node)
}

func (r *rmdOwner) do(ctx *rmdModifier) (clone *rebMD, err error) {
	ctx.prev = r.get()

	if r.cluID == "" {
		r.cluID = ctx.cluID
	}
	if ctx.smapCtx == nil {
		return
	}
	if r.cluID == "" {
		r.cluID = ctx.smapCtx.smap.UUID
	} else if r.cluID != ctx.smapCtx.smap.UUID {
		err := r.newClusterIntegrityErr("primary", ctx.smapCtx.smap.UUID, r.cluID, ctx.prev.Version)
		cos.ExitLog(err) // FATAL
	}

	clone = ctx.prev.clone()
	clone.TargetIDs = nil
	clone.Resilver = ""
	clone.CluID = r.cluID
	debug.Assert(cos.IsValidUUID(clone.CluID), clone.CluID)
	ctx.pre(ctx, clone) // `pre` callback

	if err = r.persist(clone); err == nil {
		r.put(clone)
	}
	ctx.cur = clone
	ctx.rebID = xact.RebID2S(clone.Version) // new rebID
	return
}
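// Note on the flow above: rmdOwner.do never updates the live RMD in place -
// it clones the previous RMD, lets the `pre` callback adjust the clone
// (typically rmdInc), persists the clone, and only then swaps the atomic
// pointer. The clone's Version also determines the new cluster-wide rebalance
// ID (the "g[uuid]" form mentioned in rmdModifier) via xact.RebID2S.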
/////////////////
// rmdModifier //
/////////////////

func rmdInc(_ *rmdModifier, clone *rebMD) { clone.inc() }

// via `rmdModifier.final`
func rmdSync(m *rmdModifier, clone *rebMD) {
	debug.Assert(m.cur == clone)
	m.listen(nil)
	msg := &aisMsg{ActMsg: apc.ActMsg{Action: apc.ActRebalance}, UUID: m.rebID} // user-requested rebalance
	wg := m.p.metasyncer.sync(revsPair{m.cur, msg})
	if m.wait {
		wg.Wait()
	}
}

// see `receiveRMD` (upon termination, notify IC)
func (m *rmdModifier) listen(cb func(nl nl.Listener)) {
	nl := xact.NewXactNL(m.rebID, apc.ActRebalance, &m.smapCtx.smap.Smap, nil)
	nl = nl.WithCause(m.smapCtx.msg.Action)
	nl.SetOwner(equalIC)

	nl.F = m.log
	if cb != nil {
		nl.F = cb
	}
	err := m.p.notifs.add(nl)
	debug.AssertNoErr(err)
}
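// NOTE: illustrative sketch, not part of the original file. Node-removal flows
// (maintenance/decommission) need the cleanup in postRm (below) to run once
// rebalance terminates; a `final` callback for that case might register it via
// listen, along these lines. The function name is hypothetical, and the actual
// callers may wire the message action and notification differently.
func rmdSyncAndRunPostRmSketch(m *rmdModifier, clone *rebMD) {
	debug.Assert(m.cur == clone)
	m.listen(m.postRm) // run postRm when the rebalance xaction terminates
	msg := &aisMsg{ActMsg: apc.ActMsg{Action: m.smapCtx.msg.Action}, UUID: m.rebID}
	wg := m.p.metasyncer.sync(revsPair{m.cur, msg})
	if m.wait {
		wg.Wait()
	}
}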
// deactivate or remove node from the cluster (as per msg.Action)
// called when rebalance is done
func (m *rmdModifier) postRm(nl nl.Listener) {
	var (
		p     = m.p
		tsi   = m.smapCtx.smap.GetNode(m.smapCtx.sid)
		sname = tsi.StringEx()
		xname = "rebalance[" + nl.UUID() + "]"
		smap  = p.owner.smap.get()
		ntsi  = smap.GetNode(m.smapCtx.sid) // with updated flags
		warn  = "remove " + sname + " from the current "
	)
	if ntsi != nil && (ntsi.Flags.IsSet(meta.SnodeMaint) || ntsi.Flags.IsSet(meta.SnodeMaintPostReb)) {
		warn = "mark " + sname + " for maintenance mode in the current "
	}
	warn += smap.StringEx()
	debug.Assert(nl.UUID() == m.rebID && tsi.IsTarget())

	if nl.ErrCnt() == 0 {
		nlog.Infoln("post-rebalance commit:", warn)
		if _, err := p.rmNodeFinal(m.smapCtx.msg, tsi, m.smapCtx); err != nil {
			nlog.Errorln(err)
		}
		return
	}

	m.log(nl)
	nlerr := nl.Err()

	rmd := p.owner.rmd.get()
	if nlerr == cmn.ErrXactRenewAbort || nlerr.Error() == cmn.ErrXactRenewAbort.Error() || m.cur.Version < rmd.Version {
		nlog.Errorf("Warning: %s (%s) got renewed (interrupted) - will not %s (%s)", xname, m.smapCtx.smap, warn, rmd)
		return
	}
	if m.smapCtx.msg.Action != apc.ActRmNodeUnsafe && m.smapCtx.msg.Action != apc.ActDecommissionNode {
		nlog.Errorf("operation %q => %s (%s) failed - will not %s", m.smapCtx.msg.Action, xname, m.smapCtx.smap, warn)
		return
	}

	// go ahead to decommission anyway
	nlog.Errorf("given %q operation and despite [%v] - proceeding to %s", m.smapCtx.msg.Action, nlerr, warn)
	if _, err := p.rmNodeFinal(m.smapCtx.msg, tsi, m.smapCtx); err != nil {
		nlog.Errorln(err)
	}

	//
	// TODO: bcast targets to re-rebalance for the same `m.rebID` iff there isn't a new one that's running or about to run
	//
}

func (m *rmdModifier) log(nl nl.Listener) {
	debug.Assert(nl.UUID() == m.rebID)
	var (
		err  = nl.Err()
		abrt = nl.Aborted()
		name = "rebalance[" + nl.UUID() + "]"
	)
	switch {
	case err == nil && !abrt:
		nlog.InfoDepth(1, name, "done")
	case abrt:
		debug.Assert(err != nil, nl.String()+" - aborted w/ no errors")
		nlog.ErrorDepth(1, name, err)
	default:
		nlog.ErrorDepth(1, name, "failed:", err)
	}
}