github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/tgttxn.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"net/http"
    11  	"net/url"
    12  	"os"
    13  	"strconv"
    14  	"strings"
    15  	"time"
    16  
    17  	"github.com/NVIDIA/aistore/api/apc"
    18  	"github.com/NVIDIA/aistore/cmn"
    19  	"github.com/NVIDIA/aistore/cmn/archive"
    20  	"github.com/NVIDIA/aistore/cmn/cos"
    21  	"github.com/NVIDIA/aistore/cmn/debug"
    22  	"github.com/NVIDIA/aistore/cmn/feat"
    23  	"github.com/NVIDIA/aistore/cmn/k8s"
    24  	"github.com/NVIDIA/aistore/cmn/nlog"
    25  	"github.com/NVIDIA/aistore/core"
    26  	"github.com/NVIDIA/aistore/core/meta"
    27  	"github.com/NVIDIA/aistore/ext/etl"
    28  	"github.com/NVIDIA/aistore/fs"
    29  	"github.com/NVIDIA/aistore/nl"
    30  	"github.com/NVIDIA/aistore/reb"
    31  	"github.com/NVIDIA/aistore/xact"
    32  	"github.com/NVIDIA/aistore/xact/xreg"
    33  	"github.com/NVIDIA/aistore/xact/xs"
    34  	jsoniter "github.com/json-iterator/go"
    35  )
    36  
// ActCleanup is the action that txnHandler passes to t.transactions.find
// after a transaction handler has returned an error (cleanup on error).
const ActCleanup = "cleanup" // in addition to (apc.ActBegin, ...)
    38  
    39  // context structure to gather all (or most) of the relevant state in one place
    40  // (compare with txnCln)
    41  type txnSrv struct {
    42  	t          *target
    43  	msg        *aisMsg
    44  	bck        *meta.Bck // aka bckFrom
    45  	bckTo      *meta.Bck
    46  	query      url.Values
    47  	uuid       string
    48  	phase      string
    49  	callerName string
    50  	callerID   string
    51  	timeout    struct {
    52  		netw time.Duration
    53  		host time.Duration
    54  	}
    55  }
    56  
    57  // TODO: return xaction ID (xid) where applicable
    58  
    59  // verb /v1/txn
    60  func (t *target) txnHandler(w http.ResponseWriter, r *http.Request) {
    61  	var bucket, phase, xid string
    62  	if r.Method != http.MethodPost {
    63  		cmn.WriteErr405(w, r, http.MethodPost)
    64  		return
    65  	}
    66  	msg, err := t.readAisMsg(w, r)
    67  	if err != nil {
    68  		return
    69  	}
    70  
    71  	xactRecord := xact.Table[msg.Action]
    72  	onlyPrimary := xactRecord.Metasync
    73  	if !t.ensureIntraControl(w, r, onlyPrimary) {
    74  		return
    75  	}
    76  
    77  	apiItems, err := t.parseURL(w, r, apc.URLPathTxn.L, 0, true)
    78  	if err != nil {
    79  		return
    80  	}
    81  	switch len(apiItems) {
    82  	case 1: // Global transaction.
    83  		phase = apiItems[0]
    84  	case 2: // Bucket-based transaction.
    85  		bucket, phase = apiItems[0], apiItems[1]
    86  	default:
    87  		t.writeErrURL(w, r)
    88  		return
    89  	}
    90  
    91  	c := &txnSrv{t: t, msg: msg, phase: phase}
    92  	if err := c.init(r, bucket); err != nil {
    93  		t.writeErr(w, r, err)
    94  		return
    95  	}
    96  
    97  	switch msg.Action {
    98  	case apc.ActCreateBck, apc.ActAddRemoteBck:
    99  		err = t.createBucket(c)
   100  	case apc.ActMakeNCopies:
   101  		xid, err = t.makeNCopies(c)
   102  	case apc.ActSetBprops, apc.ActResetBprops:
   103  		xid, err = t.setBprops(c)
   104  	case apc.ActMoveBck:
   105  		xid, err = t.renameBucket(c)
   106  	case apc.ActCopyBck, apc.ActETLBck:
   107  		var (
   108  			dp     core.DP
   109  			tcbmsg = &apc.TCBMsg{}
   110  		)
   111  		if err := cos.MorphMarshal(c.msg.Value, tcbmsg); err != nil {
   112  			t.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, t.si, msg.Action, c.msg.Value, err)
   113  			return
   114  		}
   115  		if msg.Action == apc.ActETLBck {
   116  			var err error
   117  			if dp, err = etlDP(tcbmsg); err != nil {
   118  				t.writeErr(w, r, err)
   119  				return
   120  			}
   121  		}
   122  		xid, err = t.tcb(c, tcbmsg, dp)
   123  	case apc.ActCopyObjects, apc.ActETLObjects:
   124  		var (
   125  			dp     core.DP
   126  			tcomsg = &cmn.TCObjsMsg{}
   127  		)
   128  		if err := cos.MorphMarshal(c.msg.Value, tcomsg); err != nil {
   129  			t.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, t.si, msg.Action, c.msg.Value, err)
   130  			return
   131  		}
   132  		if msg.Action == apc.ActETLObjects {
   133  			cs := fs.Cap()
   134  			if err := cs.Err(); err != nil {
   135  				t.writeErr(w, r, err, http.StatusInsufficientStorage)
   136  				return
   137  			}
   138  			var err error
   139  			if dp, err = etlDP(&tcomsg.TCBMsg); err != nil {
   140  				t.writeErr(w, r, err)
   141  				return
   142  			}
   143  		}
   144  		xid, err = t.tcobjs(c, tcomsg, dp)
   145  	case apc.ActECEncode:
   146  		xid, err = t.ecEncode(c)
   147  	case apc.ActArchive:
   148  		xid, err = t.createArchMultiObj(c)
   149  	case apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActShutdownNode:
   150  		err = t.beginRm(c)
   151  	case apc.ActDestroyBck, apc.ActEvictRemoteBck:
   152  		err = t.destroyBucket(c)
   153  	case apc.ActPromote:
   154  		hdr := w.Header()
   155  		xid, err = t.promote(c, hdr)
   156  	default:
   157  		t.writeErrAct(w, r, msg.Action)
   158  	}
   159  	if err == nil {
   160  		if xid != "" {
   161  			w.Header().Set(apc.HdrXactionID, xid)
   162  		}
   163  		return
   164  	}
   165  
   166  	// cleanup on error
   167  	t.transactions.find(c.uuid, ActCleanup)
   168  
   169  	if cmn.IsErrCapExceeded(err) {
   170  		cs := t.OOS(nil)
   171  		t.writeErrStatusf(w, r, http.StatusInsufficientStorage, "%s: %v", cs.String(), err)
   172  	} else {
   173  		t.writeErr(w, r, err)
   174  	}
   175  }
   176  
   177  //
   178  // createBucket
   179  //
   180  
   181  func (t *target) createBucket(c *txnSrv) error {
   182  	switch c.phase {
   183  	case apc.ActBegin:
   184  		txn := newTxnCreateBucket(c)
   185  		if err := t.transactions.begin(txn); err != nil {
   186  			return err
   187  		}
   188  		if c.msg.Action == apc.ActCreateBck && c.bck.IsRemote() {
   189  			if c.msg.Value != nil {
   190  				if err := cos.MorphMarshal(c.msg.Value, &c.bck.Props); err != nil {
   191  					return fmt.Errorf(cmn.FmtErrMorphUnmarshal, t, c.msg.Action, c.msg.Value, err)
   192  				}
   193  			}
   194  			if _, err := t.Backend(c.bck).CreateBucket(c.bck); err != nil {
   195  				return err
   196  			}
   197  		}
   198  	case apc.ActAbort:
   199  		t.transactions.find(c.uuid, apc.ActAbort)
   200  	case apc.ActCommit:
   201  		t._commitCreateDestroy(c)
   202  	default:
   203  		debug.Assert(false)
   204  	}
   205  	return nil
   206  }
   207  
   208  func (t *target) _commitCreateDestroy(c *txnSrv) (err error) {
   209  	txn, err := t.transactions.find(c.uuid, "")
   210  	if err != nil {
   211  		return err
   212  	}
   213  	// wait for newBMD w/timeout
   214  	if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   215  		return cmn.NewErrFailedTo(t, "commit", txn, err)
   216  	}
   217  	return
   218  }
   219  
   220  //
   221  // makeNCopies
   222  //
   223  
   224  func (t *target) makeNCopies(c *txnSrv) (string, error) {
   225  	switch c.phase {
   226  	case apc.ActBegin:
   227  		if err := c.bck.Init(t.owner.bmd); err != nil {
   228  			return "", err
   229  		}
   230  		curCopies, newCopies, err := t.validateMakeNCopies(c.bck, c.msg)
   231  		if err != nil {
   232  			return "", err
   233  		}
   234  		cs := fs.Cap()
   235  		if err := cs.Err(); err != nil {
   236  			return "", err
   237  		}
   238  		nlp := newBckNLP(c.bck)
   239  		if !nlp.TryLock(c.timeout.netw / 2) {
   240  			return "", cmn.NewErrBusy("bucket", c.bck.Cname(""))
   241  		}
   242  		txn := newTxnMakeNCopies(c, curCopies, newCopies)
   243  		if err := t.transactions.begin(txn, nlp); err != nil {
   244  			return "", err
   245  		}
   246  	case apc.ActAbort:
   247  		t.transactions.find(c.uuid, apc.ActAbort)
   248  	case apc.ActCommit:
   249  		if err := c.bck.Init(t.owner.bmd); err != nil {
   250  			return "", err
   251  		}
   252  		copies, err := _parseNCopies(c.msg.Value)
   253  		debug.AssertNoErr(err)
   254  		txn, err := t.transactions.find(c.uuid, "")
   255  		if err != nil {
   256  			return "", err
   257  		}
   258  		txnMnc := txn.(*txnMakeNCopies)
   259  		debug.Assert(txnMnc.newCopies == copies)
   260  
   261  		// wait for newBMD w/timeout
   262  		if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   263  			return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   264  		}
   265  
   266  		// do the work in xaction
   267  		rns := xreg.RenewBckMakeNCopies(c.bck, c.uuid, "mnc-actmnc", int(copies))
   268  		if rns.Err != nil {
   269  			return "", fmt.Errorf("%s %s: %v", t, txn, rns.Err)
   270  		}
   271  		xctn := rns.Entry.Get()
   272  		flt := xreg.Flt{Kind: apc.ActPutCopies, Bck: c.bck}
   273  		xreg.DoAbort(flt, errors.New("make-n-copies"))
   274  		c.addNotif(xctn) // notify upon completion
   275  		xact.GoRunW(xctn)
   276  
   277  		return xctn.ID(), nil
   278  	default:
   279  		debug.Assert(false)
   280  	}
   281  	return "", nil
   282  }
   283  
   284  func (t *target) validateMakeNCopies(bck *meta.Bck, msg *aisMsg) (curCopies, newCopies int64, err error) {
   285  	curCopies = bck.Props.Mirror.Copies
   286  	newCopies, err = _parseNCopies(msg.Value)
   287  	if err == nil {
   288  		err = fs.ValidateNCopies(t.si.Name(), int(newCopies))
   289  	}
   290  	// (consider adding "force" option similar to CopyBckMsg.Force)
   291  	if err == nil {
   292  		err = xreg.LimitedCoexistence(t.si, bck, msg.Action)
   293  	}
   294  	if err != nil {
   295  		return
   296  	}
   297  	// don't allow increasing num-copies when used cap is above high wm (let alone OOS)
   298  	if bck.Props.Mirror.Copies < newCopies {
   299  		cs := fs.Cap()
   300  		err = cs.Err()
   301  	}
   302  	return
   303  }
   304  
   305  //
   306  // setBprops
   307  //
   308  
   309  func (t *target) setBprops(c *txnSrv) (string, error) {
   310  	switch c.phase {
   311  	case apc.ActBegin:
   312  		if err := c.bck.Init(t.owner.bmd); err != nil {
   313  			return "", err
   314  		}
   315  		var (
   316  			nprops *cmn.Bprops
   317  			err    error
   318  		)
   319  		if nprops, err = t.validateNprops(c.bck, c.msg); err != nil {
   320  			return "", err
   321  		}
   322  		nlp := newBckNLP(c.bck)
   323  		if !nlp.TryLock(c.timeout.netw / 2) {
   324  			return "", cmn.NewErrBusy("bucket", c.bck.Cname(""))
   325  		}
   326  		txn := newTxnSetBucketProps(c, nprops)
   327  		if err := t.transactions.begin(txn, nlp); err != nil {
   328  			return "", err
   329  		}
   330  	case apc.ActAbort:
   331  		t.transactions.find(c.uuid, apc.ActAbort)
   332  	case apc.ActCommit:
   333  		if err := c.bck.Init(t.owner.bmd); err != nil {
   334  			return "", err
   335  		}
   336  		var xid string
   337  		txn, err := t.transactions.find(c.uuid, "")
   338  		if err != nil {
   339  			return "", err
   340  		}
   341  		txnSetBprops := txn.(*txnSetBucketProps)
   342  		bprops, nprops := txnSetBprops.bprops, txnSetBprops.nprops
   343  		// wait for newBMD w/timeout
   344  		if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   345  			return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   346  		}
   347  		if _reMirror(bprops, nprops) {
   348  			n := int(nprops.Mirror.Copies)
   349  			rns := xreg.RenewBckMakeNCopies(c.bck, c.uuid, "mnc-setprops", n)
   350  			if rns.Err != nil {
   351  				return "", fmt.Errorf("%s %s: %v", t, txn, rns.Err)
   352  			}
   353  			xctn := rns.Entry.Get()
   354  			flt := xreg.Flt{Kind: apc.ActPutCopies, Bck: c.bck}
   355  			xreg.DoAbort(flt, errors.New("re-mirror"))
   356  			c.addNotif(xctn) // notify upon completion
   357  			xact.GoRunW(xctn)
   358  			xid = xctn.ID()
   359  		}
   360  		if _, reec := _reEC(bprops, nprops, c.bck, nil /*smap*/); reec {
   361  			flt := xreg.Flt{Kind: apc.ActECEncode, Bck: c.bck}
   362  			xreg.DoAbort(flt, errors.New("re-ec"))
   363  			rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit)
   364  			if rns.Err != nil {
   365  				return "", rns.Err
   366  			}
   367  			xctn := rns.Entry.Get()
   368  			c.addNotif(xctn) // ditto
   369  			xact.GoRunW(xctn)
   370  
   371  			if xid == "" {
   372  				xid = xctn.ID()
   373  			} else {
   374  				xid = "" // not supporting multiple..
   375  			}
   376  		}
   377  		return xid, nil
   378  	default:
   379  		debug.Assert(false)
   380  	}
   381  	return "", nil
   382  }
   383  
   384  func (t *target) validateNprops(bck *meta.Bck, msg *aisMsg) (nprops *cmn.Bprops, err error) {
   385  	var (
   386  		body = cos.MustMarshal(msg.Value)
   387  		cs   = fs.Cap()
   388  	)
   389  	nprops = &cmn.Bprops{}
   390  	if err = jsoniter.Unmarshal(body, nprops); err != nil {
   391  		err = fmt.Errorf(cmn.FmtErrUnmarshal, t, "new bucket props", cos.BHead(body), err)
   392  		return
   393  	}
   394  	err = cs.Err()
   395  	if nprops.Mirror.Enabled {
   396  		mpathCount := fs.NumAvail()
   397  		if int(nprops.Mirror.Copies) > mpathCount {
   398  			err = fmt.Errorf(fmtErrInsuffMpaths1, t, mpathCount, bck, nprops.Mirror.Copies)
   399  			return
   400  		}
   401  		if nprops.Mirror.Copies < bck.Props.Mirror.Copies {
   402  			err = nil
   403  		}
   404  	}
   405  	if !nprops.EC.Enabled && bck.Props.EC.Enabled {
   406  		err = nil
   407  	}
   408  	return
   409  }
   410  
   411  //
   412  // renameBucket
   413  //
   414  
   415  func (t *target) renameBucket(c *txnSrv) (string, error) {
   416  	switch c.phase {
   417  	case apc.ActBegin:
   418  		if err := c.bck.Init(t.owner.bmd); err != nil {
   419  			return "", err
   420  		}
   421  		bckFrom, bckTo := c.bck, c.bckTo
   422  		if err := t.validateBckRenTxn(bckFrom, bckTo, c.msg); err != nil {
   423  			return "", err
   424  		}
   425  		nlpFrom := newBckNLP(bckFrom)
   426  		nlpTo := newBckNLP(bckTo)
   427  		if !nlpFrom.TryLock(c.timeout.netw / 4) {
   428  			return "", cmn.NewErrBusy("bucket", bckFrom.Cname(""))
   429  		}
   430  		if !nlpTo.TryLock(c.timeout.netw / 4) {
   431  			nlpFrom.Unlock()
   432  			return "", cmn.NewErrBusy("bucket", bckTo.Cname(""))
   433  		}
   434  		txn := newTxnRenameBucket(c, bckFrom, bckTo)
   435  		if err := t.transactions.begin(txn, nlpFrom, nlpTo); err != nil {
   436  			return "", err
   437  		}
   438  	case apc.ActAbort:
   439  		t.transactions.find(c.uuid, apc.ActAbort)
   440  	case apc.ActCommit:
   441  		if err := c.bck.Init(t.owner.bmd); err != nil {
   442  			return "", err
   443  		}
   444  		txn, err := t.transactions.find(c.uuid, "")
   445  		if err != nil {
   446  			return "", err
   447  		}
   448  		txnRenB := txn.(*txnRenameBucket)
   449  		// wait for newBMD w/timeout
   450  		if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   451  			return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   452  		}
   453  		rns := xreg.RenewBckRename(txnRenB.bckFrom, txnRenB.bckTo, c.uuid, c.msg.RMDVersion, apc.ActCommit)
   454  		if rns.Err != nil {
   455  			nlog.Errorf("%s: %s %v", t, txn, rns.Err)
   456  			return "", rns.Err // must not happen at commit time
   457  		}
   458  		xctn := rns.Entry.Get()
   459  		err = fs.RenameBucketDirs(txnRenB.bckFrom.Bucket(), txnRenB.bckTo.Bucket())
   460  		if err != nil {
   461  			return "", err // ditto
   462  		}
   463  		c.addNotif(xctn) // notify upon completion
   464  
   465  		reb.OnTimedGFN()
   466  		xact.GoRunW(xctn) // run and wait until it starts running
   467  
   468  		return xctn.ID(), nil
   469  	default:
   470  		debug.Assert(false)
   471  	}
   472  	return "", nil
   473  }
   474  
   475  func (t *target) validateBckRenTxn(bckFrom, bckTo *meta.Bck, msg *aisMsg) error {
   476  	cs := fs.Cap()
   477  	if err := cs.Err(); err != nil {
   478  		return err
   479  	}
   480  	if err := xreg.LimitedCoexistence(t.si, bckFrom, msg.Action, bckTo); err != nil {
   481  		return err
   482  	}
   483  	bmd := t.owner.bmd.get()
   484  	if _, present := bmd.Get(bckFrom); !present {
   485  		return cmn.NewErrBckNotFound(bckFrom.Bucket())
   486  	}
   487  	if _, present := bmd.Get(bckTo); present {
   488  		return cmn.NewErrBckAlreadyExists(bckTo.Bucket())
   489  	}
   490  	availablePaths := fs.GetAvail()
   491  	for _, mi := range availablePaths {
   492  		path := mi.MakePathCT(bckTo.Bucket(), fs.ObjectType)
   493  		if err := cos.Stat(path); err != nil {
   494  			if !os.IsNotExist(err) {
   495  				return err
   496  			}
   497  			continue
   498  		}
   499  		if names, empty, err := fs.IsDirEmpty(path); err != nil {
   500  			return err
   501  		} else if !empty {
   502  			return fmt.Errorf("directory %q already exists and is not empty (%v...)", path, names)
   503  		}
   504  	}
   505  	return nil
   506  }
   507  
   508  func etlDP(msg *apc.TCBMsg) (core.DP, error) {
   509  	if !k8s.IsK8s() {
   510  		return nil, k8s.ErrK8sRequired
   511  	}
   512  	if err := msg.Validate(true); err != nil {
   513  		return nil, err
   514  	}
   515  	return etl.NewOfflineDP(msg, cmn.GCO.Get())
   516  }
   517  
   518  // common for both bucket copy and bucket transform - does the heavy lifting
   519  func (t *target) tcb(c *txnSrv, msg *apc.TCBMsg, dp core.DP) (string, error) {
   520  	switch c.phase {
   521  	case apc.ActBegin:
   522  		if err := c.bck.Init(t.owner.bmd); err != nil {
   523  			return "", err
   524  		}
   525  		bckTo, bckFrom := c.bckTo, c.bck
   526  		if err := bckTo.Validate(); err != nil {
   527  			return "", err
   528  		}
   529  		if err := bckFrom.Validate(); err != nil {
   530  			return "", err
   531  		}
   532  		cs := fs.Cap()
   533  		if err := cs.Err(); err != nil {
   534  			return "", err
   535  		}
   536  		if err := xreg.LimitedCoexistence(t.si, bckFrom, c.msg.Action); err != nil {
   537  			if !msg.Force {
   538  				return "", err
   539  			}
   540  			nlog.Errorf("%s: %v - %q is \"forced\", proceeding anyway", t, err, c.msg.Action)
   541  		}
   542  		bmd := t.owner.bmd.get()
   543  		if _, present := bmd.Get(bckFrom); !present {
   544  			return "", cmn.NewErrBckNotFound(bckFrom.Bucket())
   545  		}
   546  		if err := t._tcbBegin(c, msg, dp); err != nil {
   547  			return "", err
   548  		}
   549  	case apc.ActAbort:
   550  		t.transactions.find(c.uuid, apc.ActAbort)
   551  	case apc.ActCommit:
   552  		if err := c.bck.Init(t.owner.bmd); err != nil {
   553  			return "", err
   554  		}
   555  		txn, err := t.transactions.find(c.uuid, "")
   556  		if err != nil {
   557  			return "", err
   558  		}
   559  		txnTcb := txn.(*txnTCB)
   560  
   561  		if c.query.Get(apc.QparamWaitMetasync) != "" {
   562  			if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   563  				txnTcb.xtcb.TxnAbort(err)
   564  				return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   565  			}
   566  		} else {
   567  			t.transactions.find(c.uuid, apc.ActCommit)
   568  		}
   569  
   570  		custom := txnTcb.xtcb.Args()
   571  		if custom.Phase != apc.ActBegin {
   572  			err = fmt.Errorf("%s: %s is already running", t, txnTcb) // never here
   573  			nlog.Errorln(err)
   574  			return "", err
   575  		}
   576  		custom.Phase = apc.ActCommit
   577  		rns := xreg.RenewTCB(c.uuid, c.msg.Action /*kind*/, txnTcb.xtcb.Args())
   578  		if rns.Err != nil {
   579  			if !cmn.IsErrXactUsePrev(rns.Err) {
   580  				txnTcb.xtcb.TxnAbort(rns.Err)
   581  				nlog.Errorf("%s: %s %v", t, txn, rns.Err)
   582  			}
   583  			return "", rns.Err
   584  		}
   585  		xctn := rns.Entry.Get()
   586  		xid := xctn.ID()
   587  		debug.Assert(xid == txnTcb.xtcb.ID())
   588  		c.addNotif(xctn) // notify upon completion
   589  		xact.GoRunW(xctn)
   590  		return xid, nil
   591  	default:
   592  		debug.Assert(false)
   593  	}
   594  	return "", nil
   595  }
   596  
   597  func (t *target) _tcbBegin(c *txnSrv, msg *apc.TCBMsg, dp core.DP) (err error) {
   598  	var (
   599  		bckTo, bckFrom = c.bckTo, c.bck
   600  		nlpFrom        = newBckNLP(bckFrom)
   601  		nlpTo          core.NLP
   602  	)
   603  	if !nlpFrom.TryRLock(c.timeout.netw / 4) {
   604  		return cmn.NewErrBusy("bucket", bckFrom.Cname(""))
   605  	}
   606  	if !msg.DryRun && !bckFrom.Equal(bckTo, true, true) {
   607  		nlpTo = newBckNLP(bckTo)
   608  		if !nlpTo.TryLock(c.timeout.netw / 4) {
   609  			nlpFrom.Unlock()
   610  			return cmn.NewErrBusy("bucket", bckTo.Cname(""))
   611  		}
   612  	}
   613  	custom := &xreg.TCBArgs{Phase: apc.ActBegin, BckFrom: bckFrom, BckTo: bckTo, DP: dp, Msg: msg}
   614  	rns := xreg.RenewTCB(c.uuid, c.msg.Action /*kind*/, custom)
   615  	if err = rns.Err; err != nil {
   616  		nlog.Errorf("%s: %q %+v %v", t, c.uuid, msg, rns.Err)
   617  		return
   618  	}
   619  
   620  	var (
   621  		xctn = rns.Entry.Get()
   622  		xtcb = xctn.(*xs.XactTCB)
   623  		txn  = newTxnTCB(c, xtcb)
   624  		nlps = []core.NLP{nlpFrom}
   625  	)
   626  	if nlpTo != nil {
   627  		nlps = append(nlps, nlpTo)
   628  	}
   629  	return t.transactions.begin(txn, nlps...)
   630  }
   631  
   632  // Two IDs:
   633  // - TxnUUID: transaction (txn) ID
   634  // - xid: xaction ID (will have "tco-" prefix)
   635  func (t *target) tcobjs(c *txnSrv, msg *cmn.TCObjsMsg, dp core.DP) (xid string, _ error) {
   636  	switch c.phase {
   637  	case apc.ActBegin:
   638  		var (
   639  			bckTo   = c.bckTo
   640  			bckFrom = c.bck // from
   641  		)
   642  		if err := c.bck.Init(t.owner.bmd); err != nil {
   643  			return xid, err
   644  		}
   645  		// validate
   646  		if err := bckTo.Validate(); err != nil {
   647  			return xid, err
   648  		}
   649  		if err := bckFrom.Validate(); err != nil {
   650  			return xid, err
   651  		}
   652  		cs := fs.Cap()
   653  		if err := cs.Err(); err != nil {
   654  			return xid, err
   655  		}
   656  		if err := xreg.LimitedCoexistence(t.si, bckFrom, c.msg.Action); err != nil {
   657  			return xid, err
   658  		}
   659  		bmd := t.owner.bmd.get()
   660  		if _, present := bmd.Get(bckFrom); !present {
   661  			return xid, cmn.NewErrBckNotFound(bckFrom.Bucket())
   662  		}
   663  		// begin
   664  		custom := &xreg.TCObjsArgs{BckFrom: bckFrom, BckTo: bckTo, DP: dp}
   665  		rns := xreg.RenewTCObjs(c.msg.Action /*kind*/, custom)
   666  		if rns.Err != nil {
   667  			nlog.Errorf("%s: %q %+v %v", t, c.uuid, c.msg, rns.Err)
   668  			return xid, rns.Err
   669  		}
   670  		xctn := rns.Entry.Get()
   671  		xid = xctn.ID()
   672  
   673  		xtco := xctn.(*xs.XactTCObjs)
   674  
   675  		debug.Assert(msg.TxnUUID == "" || msg.TxnUUID == c.uuid) // (ref050724)
   676  		msg.TxnUUID = c.uuid
   677  		txn := newTxnTCObjs(c, bckFrom, xtco, msg)
   678  		if err := t.transactions.begin(txn); err != nil {
   679  			return xid, err
   680  		}
   681  		xtco.Begin(msg)
   682  	case apc.ActAbort:
   683  		txn, err := t.transactions.find(c.uuid, apc.ActAbort)
   684  		if err == nil {
   685  			txnTco := txn.(*txnTCObjs)
   686  			// if _this_ transaction initiated _that_ on-demand
   687  			if xtco := txnTco.xtco; xtco != nil && xtco.ID() == c.uuid {
   688  				xid = xtco.ID()
   689  				xtco.Abort(nil)
   690  			}
   691  		}
   692  	case apc.ActCommit:
   693  		if err := c.bck.Init(t.owner.bmd); err != nil {
   694  			return xid, err
   695  		}
   696  		txn, err := t.transactions.find(c.uuid, "")
   697  		if err != nil {
   698  			return xid, err
   699  		}
   700  		txnTco := txn.(*txnTCObjs)
   701  		var done bool
   702  		if c.query.Get(apc.QparamWaitMetasync) != "" {
   703  			if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   704  				txnTco.xtco.TxnAbort(err)
   705  				return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   706  			}
   707  			done = true
   708  		}
   709  
   710  		txnTco.xtco.Do(txnTco.msg)
   711  		xid = txnTco.xtco.ID()
   712  		if !done {
   713  			t.transactions.find(c.uuid, apc.ActCommit)
   714  		}
   715  	default:
   716  		debug.Assert(false)
   717  	}
   718  	return xid, nil
   719  }
   720  
   721  //
   722  // ecEncode
   723  //
   724  
   725  func (t *target) ecEncode(c *txnSrv) (string, error) {
   726  	switch c.phase {
   727  	case apc.ActBegin:
   728  		if err := c.bck.Init(t.owner.bmd); err != nil {
   729  			return "", err
   730  		}
   731  		if err := t.validateECEncode(c.bck, c.msg); err != nil {
   732  			return "", err
   733  		}
   734  		cs := fs.Cap()
   735  		if err := cs.Err(); err != nil {
   736  			return "", err
   737  		}
   738  		nlp := newBckNLP(c.bck)
   739  
   740  		if !nlp.TryLock(c.timeout.netw / 4) {
   741  			return "", cmn.NewErrBusy("bucket", c.bck.Cname(""))
   742  		}
   743  		txn := newTxnECEncode(c, c.bck)
   744  		if err := t.transactions.begin(txn, nlp); err != nil {
   745  			return "", err
   746  		}
   747  	case apc.ActAbort:
   748  		t.transactions.find(c.uuid, apc.ActAbort)
   749  	case apc.ActCommit:
   750  		if err := c.bck.Init(t.owner.bmd); err != nil {
   751  			return "", err
   752  		}
   753  		txn, err := t.transactions.find(c.uuid, "")
   754  		if err != nil {
   755  			return "", err
   756  		}
   757  		// wait for newBMD w/timeout
   758  		if err = t.transactions.wait(txn, c.timeout.netw, c.timeout.host); err != nil {
   759  			return "", cmn.NewErrFailedTo(t, "commit", txn, err)
   760  		}
   761  		rns := xreg.RenewECEncode(c.bck, c.uuid, apc.ActCommit)
   762  		if rns.Err != nil {
   763  			nlog.Errorf("%s: %s %v", t, txn, rns.Err)
   764  			return "", rns.Err
   765  		}
   766  		xctn := rns.Entry.Get()
   767  		c.addNotif(xctn) // notify upon completion
   768  		xact.GoRunW(xctn)
   769  
   770  		return xctn.ID(), rns.Err
   771  	default:
   772  		debug.Assert(false)
   773  	}
   774  	return "", nil
   775  }
   776  
   777  func (t *target) validateECEncode(bck *meta.Bck, msg *aisMsg) error {
   778  	cs := fs.Cap()
   779  	if err := cs.Err(); err != nil {
   780  		return err
   781  	}
   782  	return xreg.LimitedCoexistence(t.si, bck, msg.Action)
   783  }
   784  
   785  //
   786  // createArchMultiObj
   787  //
   788  
   789  func (t *target) createArchMultiObj(c *txnSrv) (string /*xaction uuid*/, error) {
   790  	var xid string
   791  	switch c.phase {
   792  	case apc.ActBegin:
   793  		var (
   794  			bckTo   = c.bckTo
   795  			bckFrom = c.bck
   796  		)
   797  		if err := c.bck.Init(t.owner.bmd); err != nil {
   798  			return xid, err
   799  		}
   800  		if err := bckTo.Validate(); err != nil {
   801  			return xid, err
   802  		}
   803  		if !bckFrom.Equal(bckTo, false, false) {
   804  			if err := bckFrom.Validate(); err != nil {
   805  				return xid, err
   806  			}
   807  		}
   808  		archMsg := &cmn.ArchiveBckMsg{}
   809  		if err := cos.MorphMarshal(c.msg.Value, archMsg); err != nil {
   810  			return xid, fmt.Errorf(cmn.FmtErrMorphUnmarshal, t, c.msg.Action, c.msg.Value, err)
   811  		}
   812  		mime, err := archive.Mime(archMsg.Mime, archMsg.ArchName)
   813  		if err != nil {
   814  			return xid, err
   815  		}
   816  		archMsg.Mime = mime // set it for xarch
   817  
   818  		cs := fs.Cap()
   819  		if err := cs.Err(); err != nil {
   820  			return xid, err
   821  		}
   822  
   823  		rns := xreg.RenewPutArchive(bckFrom, bckTo)
   824  		if rns.Err != nil {
   825  			nlog.Errorf("%s: %q %+v %v", t, c.uuid, archMsg, rns.Err)
   826  			return xid, rns.Err
   827  		}
   828  		xctn := rns.Entry.Get()
   829  		xid = xctn.ID()
   830  
   831  		xarch := xctn.(*xs.XactArch)
   832  		// finalize the message and begin local transaction
   833  		archMsg.TxnUUID = c.uuid
   834  		archMsg.FromBckName = bckFrom.Name
   835  		archlom := core.AllocLOM(archMsg.ArchName)
   836  		if err := xarch.Begin(archMsg, archlom); err != nil {
   837  			core.FreeLOM(archlom) // otherwise is freed by x-archive
   838  			return xid, err
   839  		}
   840  		txn := newTxnArchMultiObj(c, bckFrom, xarch, archMsg)
   841  		if err := t.transactions.begin(txn); err != nil {
   842  			return xid, err
   843  		}
   844  	case apc.ActAbort:
   845  		txn, err := t.transactions.find(c.uuid, apc.ActAbort)
   846  		if err == nil {
   847  			txnArch := txn.(*txnArchMultiObj)
   848  			// if _this_ transaction initiated _that_ on-demand
   849  			if xarch := txnArch.xarch; xarch != nil && xarch.ID() == c.uuid {
   850  				xid = xarch.ID()
   851  				xarch.Abort(nil)
   852  			}
   853  		}
   854  	case apc.ActCommit:
   855  		if err := c.bck.Init(t.owner.bmd); err != nil {
   856  			return xid, err
   857  		}
   858  		txn, err := t.transactions.find(c.uuid, "")
   859  		if err != nil {
   860  			return xid, err
   861  		}
   862  		txnArch := txn.(*txnArchMultiObj)
   863  		txnArch.xarch.Do(txnArch.msg)
   864  		xid = txnArch.xarch.ID()
   865  		t.transactions.find(c.uuid, apc.ActCommit)
   866  	}
   867  	return xid, nil
   868  }
   869  
   870  //
   871  // begin (maintenance -- decommission -- shutdown) via p.beginRmTarget
   872  //
   873  
   874  func (t *target) beginRm(c *txnSrv) error {
   875  	var opts apc.ActValRmNode
   876  	if c.phase != apc.ActBegin {
   877  		return fmt.Errorf("%s: expecting begin phase, got %q", t, c.phase)
   878  	}
   879  	if err := cos.MorphMarshal(c.msg.Value, &opts); err != nil {
   880  		return fmt.Errorf(cmn.FmtErrMorphUnmarshal, t, c.msg.Action, c.msg.Value, err)
   881  	}
   882  	return xreg.LimitedCoexistence(t.si, nil, c.msg.Action)
   883  }
   884  
   885  //
   886  // destroy local bucket / evict cloud bucket
   887  //
   888  
   889  func (t *target) destroyBucket(c *txnSrv) error {
   890  	switch c.phase {
   891  	case apc.ActBegin:
   892  		nlp := newBckNLP(c.bck)
   893  		if !nlp.TryLock(c.timeout.netw / 2) {
   894  			return cmn.NewErrBusy("bucket", c.bck.Cname(""))
   895  		}
   896  		txn := newTxnBckBase(c.bck)
   897  		txn.fillFromCtx(c)
   898  		if err := t.transactions.begin(txn, nlp); err != nil {
   899  			return err
   900  		}
   901  	case apc.ActAbort:
   902  		t.transactions.find(c.uuid, apc.ActAbort)
   903  	case apc.ActCommit:
   904  		t._commitCreateDestroy(c)
   905  	default:
   906  		debug.Assert(false)
   907  	}
   908  	return nil
   909  }
   910  
   911  func (t *target) promote(c *txnSrv, hdr http.Header) (string, error) {
   912  	switch c.phase {
   913  	case apc.ActBegin:
   914  		if err := c.bck.Init(t.owner.bmd); err != nil {
   915  			return "", err
   916  		}
   917  		cs := fs.Cap()
   918  		if err := cs.Err(); err != nil {
   919  			return "", err
   920  		}
   921  		prmMsg := &apc.PromoteArgs{}
   922  		if err := cos.MorphMarshal(c.msg.Value, prmMsg); err != nil {
   923  			err = fmt.Errorf(cmn.FmtErrMorphUnmarshal, t, c.msg.Action, c.msg.Value, err)
   924  			return "", err
   925  		}
   926  		if strings.Contains(prmMsg.ObjName, "../") || strings.Contains(prmMsg.ObjName, "~/") {
   927  			return "", fmt.Errorf("invalid object name or prefix %q", prmMsg.ObjName)
   928  		}
   929  		srcFQN := c.msg.Name
   930  		finfo, err := os.Stat(srcFQN)
   931  		if err != nil {
   932  			return "", err
   933  		}
   934  		if !finfo.IsDir() {
   935  			txn := newTxnPromote(c, prmMsg, []string{srcFQN}, "" /*dirFQN*/, 1)
   936  			if err := t.transactions.begin(txn); err != nil {
   937  				return "", err
   938  			}
   939  			hdr.Set(apc.HdrPromoteNamesNum, "1")
   940  			return "", nil
   941  		}
   942  
   943  		// directory
   944  		fqns, totalN, cksumVal, err := prmScan(srcFQN, prmMsg)
   945  		if totalN == 0 {
   946  			if err != nil {
   947  				return "", err
   948  			}
   949  			return "", fmt.Errorf("%s: directory %q is empty", t, srcFQN)
   950  		}
   951  		txn := newTxnPromote(c, prmMsg, fqns, srcFQN /*dir*/, totalN)
   952  		if err := t.transactions.begin(txn); err != nil {
   953  			return "", err
   954  		}
   955  		hdr.Set(apc.HdrPromoteNamesHash, cksumVal)
   956  		hdr.Set(apc.HdrPromoteNamesNum, strconv.Itoa(totalN))
   957  	case apc.ActAbort:
   958  		t.transactions.find(c.uuid, apc.ActAbort)
   959  	case apc.ActCommit:
   960  		if err := c.bck.Init(t.owner.bmd); err != nil {
   961  			return "", err
   962  		}
   963  		txn, err := t.transactions.find(c.uuid, "")
   964  		if err != nil {
   965  			return "", err
   966  		}
   967  		txnPrm, ok := txn.(*txnPromote)
   968  		debug.Assert(ok)
   969  		defer t.transactions.find(c.uuid, apc.ActCommit)
   970  
   971  		if txnPrm.totalN == 0 {
   972  			nlog.Infof("%s: nothing to do (%s)", t, txnPrm)
   973  			return "", nil
   974  		}
   975  		// set by controlling proxy upon collecting and comparing all the begin-phase results
   976  		txnPrm.fshare = c.query.Get(apc.QparamConfirmFshare) != ""
   977  
   978  		// promote synchronously wo/ xaction;
   979  		// (set by proxy to eliminate any ambiguity vis-a-vis `promoteNumSync` special)
   980  		if noXact := c.query.Get(apc.QparamActNoXact) != ""; noXact {
   981  			nlog.Infof("%s: promote synchronously %s", t, txnPrm)
   982  			err := t.prmNumFiles(c, txnPrm, txnPrm.fshare)
   983  			return "", err
   984  		}
   985  
   986  		rns := xreg.RenewPromote(c.uuid, c.bck, txnPrm.msg)
   987  		if rns.Err != nil {
   988  			nlog.Errorf("%s: %s %v", t, txnPrm, rns.Err)
   989  			return "", rns.Err
   990  		}
   991  		xprm := rns.Entry.Get().(*xs.XactDirPromote)
   992  		xprm.SetFshare(txnPrm.fshare)
   993  		txnPrm.xprm = xprm
   994  
   995  		c.addNotif(xprm) // upon completion
   996  		xact.GoRunW(xprm)
   997  		return xprm.ID(), nil
   998  	default:
   999  		debug.Assert(false)
  1000  	}
  1001  	return "", nil
  1002  }
  1003  
  1004  // scan and, optionally, auto-detect file-share
  1005  func prmScan(dirFQN string, prmMsg *apc.PromoteArgs) (fqns []string, totalN int, cksumVal string, err error) {
  1006  	var (
  1007  		cksum      *cos.CksumHash
  1008  		autoDetect = !prmMsg.SrcIsNotFshare || !cmn.Rom.Features().IsSet(feat.DontAutoDetectFshare)
  1009  	)
  1010  	cb := func(fqn string, de fs.DirEntry) (err error) {
  1011  		if de.IsDir() {
  1012  			return
  1013  		}
  1014  		if len(fqns) == 0 {
  1015  			fqns = make([]string, 0, promoteNumSync)
  1016  		}
  1017  		if len(fqns) < promoteNumSync {
  1018  			fqns = append(fqns, fqn)
  1019  		}
  1020  		totalN++
  1021  		if autoDetect {
  1022  			cksum.H.Write([]byte(fqn))
  1023  		}
  1024  		return
  1025  	}
  1026  	if autoDetect {
  1027  		cksum = cos.NewCksumHash(cos.ChecksumXXHash)
  1028  	}
  1029  	if prmMsg.Recursive {
  1030  		opts := &fs.WalkOpts{Dir: dirFQN, Callback: cb, Sorted: true}
  1031  		err = fs.Walk(opts)
  1032  	} else {
  1033  		err = fs.WalkDir(dirFQN, cb)
  1034  	}
  1035  
  1036  	if err != nil || totalN == 0 || !autoDetect {
  1037  		return
  1038  	}
  1039  	cksum.Finalize()
  1040  	cksumVal = cksum.Value()
  1041  	return
  1042  }
  1043  
  1044  // synchronously wo/ xaction
  1045  func (t *target) prmNumFiles(c *txnSrv, txnPrm *txnPromote, confirmedFshare bool) error {
  1046  	smap := t.owner.smap.Get()
  1047  	config := cmn.GCO.Get()
  1048  	for _, fqn := range txnPrm.fqns {
  1049  		objName, err := xs.PrmObjName(fqn, txnPrm.dirFQN, txnPrm.msg.ObjName)
  1050  		if err != nil {
  1051  			return err
  1052  		}
  1053  		// file share == true: promote only the part of the txnPrm.fqns that "lands" locally
  1054  		if confirmedFshare {
  1055  			si, err := smap.HrwName2T(c.bck.MakeUname(objName))
  1056  			if err != nil {
  1057  				return err
  1058  			}
  1059  			if si.ID() != t.SID() {
  1060  				continue
  1061  			}
  1062  		}
  1063  		params := core.PromoteParams{
  1064  			Bck:    c.bck,
  1065  			Config: config,
  1066  			PromoteArgs: apc.PromoteArgs{
  1067  				SrcFQN:       fqn,
  1068  				ObjName:      objName,
  1069  				OverwriteDst: txnPrm.msg.OverwriteDst,
  1070  				DeleteSrc:    txnPrm.msg.DeleteSrc,
  1071  			},
  1072  		}
  1073  		if _, err := t.Promote(&params); err != nil {
  1074  			return err
  1075  		}
  1076  	}
  1077  	return nil
  1078  }
  1079  
  1080  ////////////
  1081  // txnSrv //
  1082  ////////////
  1083  
  1084  func (c *txnSrv) init(r *http.Request, bucket string) (err error) {
  1085  	c.callerName = r.Header.Get(apc.HdrCallerName)
  1086  	c.callerID = r.Header.Get(apc.HdrCallerID)
  1087  
  1088  	query := r.URL.Query()
  1089  	if bucket != "" {
  1090  		if c.bck, err = newBckFromQ(bucket, query, nil); err != nil {
  1091  			return err
  1092  		}
  1093  	}
  1094  	c.bckTo, err = newBckFromQuname(query, false /*required*/)
  1095  	if err != nil {
  1096  		return err
  1097  	}
  1098  
  1099  	// latency = (network) +- (clock drift)
  1100  	if c.phase == apc.ActBegin {
  1101  		if ptime := query.Get(apc.QparamUnixTime); ptime != "" {
  1102  			now := time.Now().UnixNano()
  1103  			dur := ptLatency(now, ptime, r.Header.Get(apc.HdrCallerIsPrimary))
  1104  			lim := int64(cmn.Rom.CplaneOperation()) >> 1
  1105  			if dur > lim || dur < -lim {
  1106  				nlog.Errorf("Warning: clock drift %s <-> %s(self) = %v, txn %s[%s]",
  1107  					c.callerName, c.t, time.Duration(dur), c.msg.Action, c.msg.UUID)
  1108  			}
  1109  		}
  1110  	}
  1111  
  1112  	c.uuid = c.msg.UUID
  1113  	if c.uuid == "" {
  1114  		return nil
  1115  	}
  1116  	if tout := query.Get(apc.QparamNetwTimeout); tout != "" {
  1117  		c.timeout.netw, err = cos.S2Duration(tout)
  1118  		debug.AssertNoErr(err)
  1119  	}
  1120  	if tout := query.Get(apc.QparamHostTimeout); tout != "" {
  1121  		c.timeout.host, err = cos.S2Duration(tout)
  1122  		debug.AssertNoErr(err)
  1123  	}
  1124  	c.query = query // operation-specific values, if any
  1125  	return err
  1126  }
  1127  
  1128  func (c *txnSrv) addNotif(xctn core.Xact) {
  1129  	dsts, ok := c.query[apc.QparamNotifyMe]
  1130  	if !ok {
  1131  		return
  1132  	}
  1133  	xctn.AddNotif(&xact.NotifXact{
  1134  		Base: nl.Base{When: core.UponTerm, Dsts: dsts, F: c.t.notifyTerm},
  1135  		Xact: xctn,
  1136  	})
  1137  }