github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/metasync.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cifl"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/jsp"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/memsys"
	jsoniter "github.com/json-iterator/go"
)

// Metasync provides two methods to the rest of the `ais` code:
// * sync - to synchronize cluster-level metadata (the main method)
// * becomeNonPrimary - to be called when the current primary becomes non-primary
//
// All other methods and the metasync's own state are private and internal.
//
// Method `do()` does most of the work (comments inline).
//
// REVS (see interface below) stands for REplicated, Versioned and Shared/Synchronized.
//
// A REVS is an object that represents a certain kind of cluster-wide metadata and
// must be consistently replicated across the entire cluster. To that end, the
// "metasyncer" provides a generic transport to send an arbitrary payload that
// combines any number of data units having the following layout:
//
//         (shared-replicated-object, associated action-message),
//
// where `associated action-message` (aisMsg) provides receivers with the operation
// ("action") and other relevant context.
//
// Further, the metasyncer:
//
// 1) tracks the last synchronized REVS versions
// 2) makes sure all version updates are executed strictly in non-decreasing
//    version order
// 3) makes sure that nodes that join the cluster get updated with the current set
//    of REVS objects
// 4) handles failures to update existing nodes by periodically retrying
//    pending synchronizations (for as long as those members remain in the
//    most recent and current cluster map).
//
// Last but not least, the metasyncer checks that only the currently elected
// leader (aka "primary proxy") distributes the REVS objects, thus providing for
// simple serialization of the versioned updates.
//
// Usage is simple: the single sync() method accepts a variable number of
// parameters. For example, to sync Smap and BMD asynchronously:
//
// metasyncer.sync(smapOwner.get(), action1, owner.bmd.get(), action2)
//
// To block until all the replicas get delivered:
//
// wg = metasyncer.sync(...)
// wg.Wait()
//
// On the receiving side, the payload (see above) gets extracted, validated,
// version-compared, and the corresponding Rx handler gets invoked
// with additional information that includes the per-replica action message.
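
// The sketch below is illustrative only (not part of the original implementation):
// it restates the two calling patterns from the comment above as compiling Go.
// The function name and its arguments are hypothetical - a real caller would be
// the primary proxy holding freshly updated smapX/bucketMD clones and an aisMsg.
func exampleSyncUsage(y *metasyncer, smap *smapX, bmd *bucketMD, msg *aisMsg) {
	// fire-and-forget: enqueue both pairs; do() runs asynchronously
	_ = y.sync(revsPair{revs: smap, msg: msg}, revsPair{revs: bmd, msg: msg})

	// blocking: wait until do() has attempted delivery to all nodes
	wg := y.sync(revsPair{revs: smap, msg: msg})
	wg.Wait()
}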

const (
	revsSmapTag  = "Smap"
	revsRMDTag   = "RMD"
	revsBMDTag   = "BMD"
	revsConfTag  = "Conf"
	revsTokenTag = "token"
	revsEtlMDTag = "EtlMD"

	revsMaxTags   = 6         // NOTE: must match the number of revs tags above
	revsActionTag = "-action" // appended to a revs tag (e.g., "Smap-action")
)

const (
	reqSync = iota
	reqNotify
)

const failsync = "failing to sync"

type (
	// REVS object interface (for a minimal illustrative implementation,
	// see the exampleMD sketch right below this type block)
	revs interface {
		tag() string         // enum { revsSmapTag, ... }
		version() int64      // the version
		marshal() (b []byte) // marshals the revs
		jit(p *proxy) revs   // current (just-in-time) instance
		sgl() *memsys.SGL    // jsp-encoded SGL
		String() string      // smap.String(), etc.
	}
	revsPair struct {
		revs revs
		msg  *aisMsg
	}
	revsReq struct {
		wg        *sync.WaitGroup
		failedCnt *atomic.Int32
		pairs     []revsPair
		reqType   int // enum: reqSync, etc.
	}
	msPayload map[string][]byte     // tag => revs' body
	ndRevs    map[string]int64      // tag => version (see nodesRevs)
	tagl      map[int64]*memsys.SGL // version => SGL jsp-formatted

	// main
	metasyncer struct {
		p            *proxy            // parent
		nodesRevs    map[string]ndRevs // cluster-wide node ID => ndRevs sync-ed
		sgls         map[string]tagl   // tag => (version => SGL)
		lastSynced   map[string]revs   // tag => revs last/current sync-ed
		stopCh       chan struct{}     // stop channel
		workCh       chan revsReq      // work channel
		retryTimer   *time.Timer       // timer to sync pending
		timerStopped bool              // true if retryTimer has been stopped, false otherwise
	}
	// metasync Rx structured error
	errMsync struct {
		Message string    `json:"message"`
		Cii     cifl.Info `json:"cii"`
	}
)
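
// Minimal sketch of a revs implementation - illustrative only, built around
// a hypothetical exampleMD type. The real REVS types (smapX, bucketMD, rebMD,
// etc.) implement this interface elsewhere in the package.
type exampleMD struct {
	V int64 `json:"version"`
}

func (*exampleMD) tag() string      { return "example" }
func (r *exampleMD) version() int64 { return r.V }
func (r *exampleMD) marshal() []byte {
	return cos.MustMarshal(r) // "slow path"; real types cache a jsp-encoded SGL
}
func (r *exampleMD) jit(*proxy) revs { return r } // nothing newer to fetch just-in-time
func (*exampleMD) sgl() *memsys.SGL  { return nil } // no cached encoding (forces the slow path)
func (r *exampleMD) String() string  { return fmt.Sprintf("example v%d", r.V) }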

// interface guard
var _ cos.Runner = (*metasyncer)(nil)

func (req revsReq) isNil() bool { return len(req.pairs) == 0 }

////////////////
// metasyncer //
////////////////

func (*metasyncer) Name() string { return "metasyncer" }

func newMetasyncer(p *proxy) (y *metasyncer) {
	y = &metasyncer{p: p}
	y.nodesRevs = make(map[string]ndRevs, 8)
	y.inigls()
	y.lastSynced = make(map[string]revs, revsMaxTags)

	y.stopCh = make(chan struct{}, 1)
	y.workCh = make(chan revsReq, 32)

	y.retryTimer = time.NewTimer(time.Hour)
	y.retryTimer.Stop()
	y.timerStopped = true
	return
}

func (y *metasyncer) Run() error {
	nlog.Infof("Starting %s", y.Name())
	for {
		config := cmn.GCO.Get()
		select {
		case revsReq, ok := <-y.workCh:
			if !ok {
				break
			}
			if revsReq.isNil() { // <== see becomeNonPrimary()
				y.nodesRevs = make(map[string]ndRevs)
				y.free()
				y.lastSynced = make(map[string]revs)
				y.retryTimer.Stop()
				y.timerStopped = true
				break
			}
			failedCnt := y.do(revsReq.pairs, revsReq.reqType)
			if revsReq.wg != nil {
				if revsReq.failedCnt != nil {
					revsReq.failedCnt.Store(int32(failedCnt))
				}
				revsReq.wg.Done()
			}
			if y.timerStopped && failedCnt > 0 {
				y.retryTimer.Reset(config.Periodic.RetrySyncTime.D())
				y.timerStopped = false
			}
			for _, revs := range y.lastSynced {
				y.delold(revs, 1)
			}
		case <-y.retryTimer.C:
			failedCnt := y.handlePending()
			if failedCnt > 0 {
				y.retryTimer.Reset(config.Periodic.RetrySyncTime.D())
				y.timerStopped = false
			} else {
				y.timerStopped = true
			}
		case <-y.stopCh:
			y.retryTimer.Stop()
			return nil
		}
	}
}

func (y *metasyncer) Stop(err error) {
	nlog.Infof("Stopping %s: %v", y.Name(), err)

	y.stopCh <- struct{}{}
	close(y.stopCh)
}

// notify targets only (in do(), reqNotify maps to core.Targets)
func (y *metasyncer) notify(wait bool, pair revsPair) (failedCnt int) {
	var (
		failedCntAtomic = atomic.NewInt32(0)
		req             = revsReq{pairs: []revsPair{pair}}
	)
	if y.isPrimary() != nil {
		return
	}
	if wait {
		req.wg = &sync.WaitGroup{}
		req.wg.Add(1)
		req.failedCnt = failedCntAtomic
		req.reqType = reqNotify
	}
	y.workCh <- req

	if wait {
		req.wg.Wait()
		failedCnt = int(failedCntAtomic.Load())
	}
	return
}

func (y *metasyncer) sync(pairs ...revsPair) *sync.WaitGroup {
	debug.Assert(len(pairs) > 0)
	req := revsReq{pairs: pairs}
	req.wg = &sync.WaitGroup{}
	if err := y.isPrimary(); err != nil {
		nlog.Errorln(err)
		return req.wg
	}
	req.wg.Add(1)
	req.reqType = reqSync
	y.workCh <- req
	return req.wg
}
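
// Illustrative sketch (hypothetical caller): a synchronous notification to
// targets that also reports how many of them failed to receive it. The rmd
// argument stands in for whatever revs instance an actual caller would pass.
func exampleNotify(y *metasyncer, rmd *rebMD, msg *aisMsg) error {
	if failed := y.notify(true /*wait*/, revsPair{revs: rmd, msg: msg}); failed > 0 {
		return fmt.Errorf("metasync notify: %d node%s failed", failed, cos.Plural(failed))
	}
	return nil
}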

// become non-primary (to serialize cleanup of the internal state and stop the timer)
func (y *metasyncer) becomeNonPrimary() {
drain:
	for {
		select {
		case revsReq, ok := <-y.workCh:
			if ok && revsReq.wg != nil {
				revsReq.wg.Done()
			}
		default:
			break drain
		}
	}
	y.workCh <- revsReq{}
	nlog.Infof("%s: becoming non-primary", y.p)
}

// main method; see top of the file; returns the number of "sync" failures
func (y *metasyncer) do(pairs []revsPair, reqT int) (failedCnt int) {
	var (
		refused meta.NodeMap
		newTIDs []string
		method  = http.MethodPut
	)
	if reqT == reqNotify {
		method = http.MethodPost
	}
	if nlog.Stopping() {
		return
	}

	// step: build payload and update last sync-ed
	payload := make(msPayload, 2*len(pairs))
	for _, pair := range pairs {
		var (
			revsBody []byte
			msg, tag = pair.msg, pair.revs.tag()
			revs     = pair.revs
		)
		if reqT == reqNotify {
			revsBody = revs.marshal()
		} else {
			revs = y.jit(pair)

			// in the unlikely event, the revs may still carry an sgl that has been freed
			// via the becomeNonPrimary => y.free() sequence; checking sgl.IsNil() is a compromise
			if sgl := revs.sgl(); sgl != nil && !sgl.IsNil() {
				// fast path
				revsBody = sgl.Bytes()
				y.addnew(revs)
			} else {
				// slow path
				revsBody = revs.marshal()
				if sgl := revs.sgl(); sgl != nil {
					y.addnew(revs)
				}
			}
			y.lastSynced[tag] = revs
		}
		if tag == revsRMDTag {
			md := revs.(*rebMD)
			newTIDs = md.TargetIDs
		}
		payload[tag] = revsBody                           // payload
		payload[tag+revsActionTag] = cos.MustMarshal(msg) // action message always on the wire even when empty
	}

	// step: bcast
	var (
		urlPath = apc.URLPathMetasync.S
		body    = payload.marshal(y.p.gmm)
		to      = core.AllNodes
		smap    = y.p.owner.smap.get()
	)
	defer body.Free()

	if reqT == reqNotify {
		to = core.Targets
	}
	args := allocBcArgs()
	args.req = cmn.HreqArgs{Method: method, Path: urlPath, BodyR: body}
	args.smap = smap
	args.timeout = cmn.Rom.MaxKeepalive() // making an exception for this critical op
	args.to = to
	args.ignoreMaintenance = true
	results := y.p.bcastGroup(args)
	freeBcArgs(args)

	// step: count failures and fill in refused
	for _, res := range results {
		if res.err == nil {
			if reqT == reqSync {
				y.syncDone(res.si, pairs)
			}
			continue
		}
		sname := res.si.StringEx()
		err := res.unwrap()
		// failing to sync - not retrying, ignoring
		if res.si.InMaintOrDecomm() {
			nlog.Infof("%s: %s %s (flags %s): %v(%d)", y.p, failsync, sname, res.si.Fl2S(), err, res.status)
			continue
		}
		// - retrying, counting
		if cos.IsRetriableConnErr(err) || cos.StringInSlice(res.si.ID(), newTIDs) { // always retry newTIDs (joining)
			if refused == nil {
				refused = make(meta.NodeMap, 2)
			}
			refused.Add(res.si)
		} else {
			nlog.Warningf("%s: %s %s: %v(%d)", y.p, failsync, sname, err, res.status)
			failedCnt++
		}
	}
	freeBcastRes(results)
	// step: handle connection-refused right away
	lr := len(refused)
	for range 4 { // retry
		if len(refused) == 0 {
			if lr > 0 {
				nlog.Infof("%s: %d node%s sync-ed", y.p, lr, cos.Plural(lr))
			}
			break
		}
		time.Sleep(cmn.Rom.CplaneOperation())
		smap = y.p.owner.smap.get()
		if !smap.isPrimary(y.p.si) {
			y.becomeNonPrimary()
			return
		}
		if !y.handleRefused(method, urlPath, body, refused, pairs, smap) {
			break
		}
	}
	// step: housekeep and return new pending
	smap = y.p.owner.smap.get()
	for sid := range y.nodesRevs {
		si := smap.GetActiveNode(sid)
		if si == nil {
			delete(y.nodesRevs, sid)
		}
	}
	failedCnt += len(refused)
	return
}
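
// Illustrative sketch of the wire payload that do() builds: each pair
// contributes two entries - the encoded revs body keyed by its tag, and the
// marshaled action message keyed by tag + "-action". The function and its
// arguments are hypothetical stand-ins for what do() assembles in its loop.
func examplePayload(smap *smapX, msg *aisMsg) msPayload {
	return msPayload{
		revsSmapTag:                 smap.marshal(),       // "Smap"        => Smap body
		revsSmapTag + revsActionTag: cos.MustMarshal(msg), // "Smap-action" => aisMsg
	}
}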

func (y *metasyncer) jit(pair revsPair) revs {
	var (
		s              string
		revs, msg, tag = pair.revs, pair.msg, pair.revs.tag()
		jitRevs        = revs.jit(y.p)
		skipping       bool
	)
	if jitRevs != nil && jitRevs.version() > revs.version() {
		revs = jitRevs
		skipping = true
	}
	if msg.Action != "" {
		s = ", " + msg.String()
	}
	if skipping {
		nlog.Infof("%s: newer %s v%d%s - skipping %s", y.p, tag, jitRevs.version(), s, revs)
	} else {
		nlog.Infof("%s: %s v%d%s", y.p, tag, revs.version(), s)
	}
	return revs
}

// keeping track of per-daemon versioning - TODO: extend to take care of aisMsg where pairs may be empty
func (y *metasyncer) syncDone(si *meta.Snode, pairs []revsPair) {
	ndr, ok := y.nodesRevs[si.ID()]
	smap := y.p.owner.smap.get()
	if smap.GetActiveNode(si.ID()) == nil {
		if ok {
			delete(y.nodesRevs, si.ID())
		}
		return
	}
	if !ok {
		ndr = make(map[string]int64, revsMaxTags)
		y.nodesRevs[si.ID()] = ndr
	}
	for _, revsPair := range pairs {
		revs := revsPair.revs
		ndr[revs.tag()] = revs.version()
	}
}

func (y *metasyncer) handleRefused(method, urlPath string, body io.Reader, refused meta.NodeMap, pairs []revsPair, smap *smapX) (ok bool) {
	args := allocBcArgs()
	args.req = cmn.HreqArgs{Method: method, Path: urlPath, BodyR: body}
	args.network = cmn.NetIntraControl
	args.timeout = cmn.Rom.MaxKeepalive()
	args.nodes = []meta.NodeMap{refused}
	args.nodeCount = len(refused)
	args.smap = smap
	results := y.p.bcastNodes(args)
	freeBcArgs(args)
	for _, res := range results {
		if res.err == nil {
			delete(refused, res.si.ID())
			y.syncDone(res.si, pairs)
			continue
		}
		// failing to sync
		if res.status == http.StatusConflict {
			if e := err2MsyncErr(res.err); e != nil {
				msg := fmt.Sprintf("%s [hr]: %s %s: %s [%v]", y.p.si, failsync, res.si, e.Message, e.Cii)
				if !y.remainPrimary(e, res.si, smap) {
					nlog.Errorln(msg + " - aborting")
					freeBcastRes(results)
					return false
				}
				nlog.Warningln(msg)
				continue
			}
		}
		nlog.Warningf("%s [hr]: %s %s: %v(%d)", y.p, failsync, res.si, res.unwrap(), res.status)
	}
	freeBcastRes(results)
	return true
}

// the pending map, when non-nil, contains only those daemons that still need
// at least one of the most recently sync-ed (tagged) revs
func (y *metasyncer) _pending() (pending meta.NodeMap, smap *smapX) {
	smap = y.p.owner.smap.get()
	if !smap.isPrimary(y.p.si) {
		y.becomeNonPrimary()
		return
	}
	for _, serverMap := range []meta.NodeMap{smap.Tmap, smap.Pmap} {
		for _, si := range serverMap {
			if si.ID() == y.p.SID() {
				continue
			}
			ndr, ok := y.nodesRevs[si.ID()]
			if !ok {
				y.nodesRevs[si.ID()] = make(map[string]int64, revsMaxTags)
			} else {
				inSync := true
				for tag, revs := range y.lastSynced {
					v, ok := ndr[tag]
					if !ok || v < revs.version() {
						inSync = false
						break
					} else if v > revs.version() {
						// skip older versions (TODO: don't skip sending the associated aisMsg)
						nlog.Errorf("v: %d; revs.version: %d", v, revs.version())
					}
				}
				if inSync {
					continue
				}
			}
			if pending == nil {
				pending = make(meta.NodeMap, 2)
			}
			pending.Add(si)
		}
	}
	return
}
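
// Example of the above bookkeeping (illustrative): with
// lastSynced = {"Smap": v12, "BMD": v7}, a node whose ndRevs reads
// {"Smap": 12, "BMD": 6} is pending (its BMD is behind), whereas a node
// with {"Smap": 12, "BMD": 7} is in sync and gets skipped.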

// gets invoked when retryTimer fires; returns the updated number of still-pending nodes;
// uses MethodPut since reqT here is always reqSync
func (y *metasyncer) handlePending() (failedCnt int) {
	pending, smap := y._pending()
	if len(pending) == 0 {
		nlog.Infof("no pending revs - all good")
		return
	}
	var (
		l       = len(y.lastSynced)
		payload = make(msPayload, 2*l)
		pairs   = make([]revsPair, 0, l)
		msg     = y.p.newAmsgStr("metasync: handle-pending", nil) // NOTE: same msg for all revs
		msgBody = cos.MustMarshal(msg)
	)
	for tag, revs := range y.lastSynced {
		debug.Assert(tag == revs.tag())
		if sgl := revs.sgl(); sgl != nil && !sgl.IsNil() {
			payload[tag] = sgl.Bytes()
		} else {
			payload[tag] = revs.marshal()
			if sgl := revs.sgl(); sgl != nil {
				y.addnew(revs)
			}
		}
		payload[tag+revsActionTag] = msgBody
		pairs = append(pairs, revsPair{revs, msg})
	}
	var (
		urlPath = apc.URLPathMetasync.S
		body    = payload.marshal(y.p.gmm)
		args    = allocBcArgs()
	)
	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, BodyR: body}
	args.network = cmn.NetIntraControl
	args.timeout = cmn.Rom.MaxKeepalive()
	args.nodes = []meta.NodeMap{pending}
	args.nodeCount = len(pending)
	args.smap = smap
	defer body.Free()
	results := y.p.bcastNodes(args)
	freeBcArgs(args)
	for _, res := range results {
		if res.err == nil {
			y.syncDone(res.si, pairs)
			continue
		}
		failedCnt++
		// failing to sync
		if res.status == http.StatusConflict {
			if e := err2MsyncErr(res.err); e != nil {
				msg := fmt.Sprintf("%s [hp]: %s %s: %s [%v]", y.p.si, failsync, res.si, e.Message, e.Cii)
				if !y.remainPrimary(e, res.si, smap) {
					// return zero so that the caller stops retrying (y.retryTimer)
					nlog.Errorln(msg + " - aborting")
					freeBcastRes(results)
					return 0
				}
				nlog.Warningln(msg)
				continue
			}
		}
		nlog.Warningf("%s [hp]: %s %s: %v(%d)", y.p, failsync, res.si, res.err, res.status)
	}
	freeBcastRes(results)
	return
}

// cluster integrity (cie) and primaryship checks versus remote clusterInfo;
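// decision summary (illustrative recap of the logic below):
//   remote Smap UUID invalid or version == 0  => remain primary
//   remote Smap UUID != local UUID            => split-brain: fatal (cie 90)
//   remote primary ID empty or equal to self  => remain primary
//   remote Smap version > local version       => step down (becomeNonPrimary)
//   remote Smap version < local version       => remain primary
//   equal versions, different primary         => log cie(90), remain primary (TODO)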
func (y *metasyncer) remainPrimary(e *errMsync, from *meta.Snode, smap *smapX) bool /*yes*/ {
	if !cos.IsValidUUID(e.Cii.Smap.UUID) || e.Cii.Smap.Version == 0 {
		return true
	}
	if e.Cii.Smap.UUID != smap.UUID {
		// FATAL: cluster integrity error (cie) - TODO: handle rogue nodes
		cos.ExitLogf("%s: split-brain uuid [%s %s] vs %v from %s", ciError(90), y.p.si, smap.StringEx(),
			e.Cii, from)
	}
	if e.Cii.Smap.Primary.ID == "" || e.Cii.Smap.Primary.ID == y.p.SID() {
		return true
	}
	if e.Cii.Smap.Version > smap.Version {
		nlog.Warningf("%s: detected primary change: %s vs %s [%v] from %s", y.p, smap.StringEx(),
			e.Message, e.Cii, from)
		y.becomeNonPrimary()
		return false
	}
	if e.Cii.Smap.Version < smap.Version {
		return true
	}
	nlog.Errorf("%s: [%s %s] vs %v from %s", ciError(90), y.p, smap.StringEx(), e.Cii, from)
	return true // TODO: iffy; may need to do more
}

func (y *metasyncer) isPrimary() (err error) {
	smap := y.p.owner.smap.get()
	if smap.isPrimary(y.p.si) {
		return
	}
	err = newErrNotPrimary(y.p.si, smap)
	nlog.Errorln(err)
	return
}

////////////
// y.sgls //
////////////

func (y *metasyncer) inigls() {
	y.sgls = make(map[string]tagl, revsMaxTags)
	y.sgls[revsSmapTag] = tagl{}
	y.sgls[revsBMDTag] = tagl{}
	y.sgls[revsConfTag] = tagl{}
}

func (y *metasyncer) free() {
	for _, tagl := range y.sgls {
		for _, sgl := range tagl {
			sgl.Free()
		}
	}
	y.inigls()
}

func (y *metasyncer) addnew(revs revs) {
	vgl, ok := y.sgls[revs.tag()]
	if !ok {
		vgl = tagl{}
		y.sgls[revs.tag()] = vgl
	}
	if sgl, ok := vgl[revs.version()]; ok {
		if sgl == revs.sgl() {
			return
		}
		// free the duplicate (created previously via the "slow path")
		sgl.Free()
	}
	vgl[revs.version()] = revs.sgl()
}

func (y *metasyncer) delold(revs revs, except int64) {
	vgl, ok := y.sgls[revs.tag()]
	if !ok {
		return
	}
	for v, sgl := range vgl {
		if v < revs.version()-except {
			if !sgl.IsNil() {
				sgl.Free()
			}
			delete(vgl, v)
		}
	}
}
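
// Example of the cache lifecycle above (illustrative): Run() calls
// delold(revs, 1) after every sync, so having just synced Smap v10 the
// "Smap" entry of y.sgls keeps at most {v9, v10}; all versions < 9 are
// freed back to memsys and deleted from the map.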

///////////////////////////
// metasync jsp encoding //
///////////////////////////

var (
	msjspOpts = jsp.Options{Metaver: cmn.MetaverMetasync, Signature: true, Checksum: true}
	msimmSize int64
)

func (payload msPayload) marshal(mm *memsys.MMSA) (sgl *memsys.SGL) {
	sgl = mm.NewSGL(msimmSize)
	err := jsp.Encode(sgl, payload, msjspOpts)
	cos.AssertNoErr(err)
	msimmSize = max(msimmSize, sgl.Len())
	return sgl
}

func (payload msPayload) unmarshal(reader io.ReadCloser, tag string) (err error) {
	_, err = jsp.Decode(reader, &payload, msjspOpts, tag)
	return
}
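
// Illustrative round-trip sketch (hypothetical helper): encode a payload the
// way the primary does and decode it back the way a receiver would. Assumes
// an initialized memsys.MMSA (e.g., the proxy's gmm); memsys.SGL implements
// io.Reader, hence the io.NopCloser wrapper.
func exampleRoundTrip(mm *memsys.MMSA, payload msPayload) (msPayload, error) {
	sgl := payload.marshal(mm) // jsp-encode: versioned, signed, checksummed
	defer sgl.Free()
	out := make(msPayload)
	err := out.unmarshal(io.NopCloser(sgl), "example")
	return out, err
}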

//////////////
// errMsync //
//////////////

func (e *errMsync) Error() string { return e.Message }

func (e *errMsync) message(errs ...error) (retErr error) {
	joinErr := errors.Join(errs...)
	if joinErr == nil {
		return nil
	}
	var (
		u        = joinErr.(interface{ Unwrap() []error })
		filtered = u.Unwrap()
		l        = len(filtered)
	)
	if l == 1 {
		retErr = filtered[0]
		e.Message = retErr.Error()
	} else {
		e.Message = joinErr.Error()
		retErr = fmt.Errorf("%v (and %d more error%s)", filtered[0], l-1, cos.Plural(l-1))
	}

	nlog.Warningln(cos.MustMarshalToString(e)) // extended info
	return
}

func err2MsyncErr(err error) (e *errMsync) {
	ee := errMsync{}
	if errP := jsoniter.UnmarshalFromString(err.Error(), &ee); errP == nil {
		return &ee
	}
	return
}