github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/txn.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"sync"
    11  	ratomic "sync/atomic"
    12  	"time"
    13  
    14  	"github.com/NVIDIA/aistore/api/apc"
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/cos"
    17  	"github.com/NVIDIA/aistore/cmn/debug"
    18  	"github.com/NVIDIA/aistore/cmn/mono"
    19  	"github.com/NVIDIA/aistore/cmn/nlog"
    20  	"github.com/NVIDIA/aistore/core"
    21  	"github.com/NVIDIA/aistore/core/meta"
    22  	"github.com/NVIDIA/aistore/hk"
    23  	"github.com/NVIDIA/aistore/xact/xs"
    24  )
    25  
// GC
const (
	// period at which housekeep() is registered with hk (see transactions.init)
	gcTxnsInterval = time.Hour
	// housekeep() shortens its returned interval tenfold when the number of
	// registered transactions exceeds max(gcTxnsNumKeep*4, 16)
	gcTxnsNumKeep = 16
	// multiplier applied to config.Timeout.MaxHostBusy for the GC
	// "[commit - done] timeout" check (see checkTimeout);
	// NOTE: the identifier is spelled "Timeot" (sic)
	gcTxnsTimeotMult = 10

	// multiplier applied to config.Timeout.MaxHostBusy and to
	// cmn.Rom.MaxKeepalive() in the checkTimeout thresholds
	TxnTimeoutMult = 2
)
    34  
type (
	// txn is the contract every transaction type in this file satisfies
	// (see the interface guards below); txnBckBase provides the default
	// implementation that concrete types embed.
	txn interface {
		// accessors
		uuid() string
		// started returns the timestamp recorded for the given phase
		// (apc.ActBegin or apc.ActCommit); when tm is provided, the
		// timestamp is set to tm[0] first
		started(phase string, tm ...time.Time) time.Time
		isDone() (done bool, err error)
		set(nlps []core.NLP)
		// triggers
		commitAfter(caller string, msg *aisMsg, err error, args ...any) (bool, error)
		rsvp(err error)
		// cleanup
		abort(error)
		unlock()
		// log
		String() string
	}
	// rndzvs is created by commitBefore (callerName and a mono.NanoTime
	// timestamp); its err is filled in by transactions.commitAfter when no
	// registered txn accepted the result, and is later transferred into the
	// txn by wait() via rsvp().
	rndzvs struct { // rendezvous records
		timestamp  int64
		err        *txnError
		callerName string
	}
	// two maps, two locks:
	// mtx guards m; rendezvous.mtx guards rendezvous.m
	transactions struct {
		t          *target
		m          map[string]txn // by txn.uuid
		rendezvous struct {
			m   map[string]rndzvs // ditto
			mtx sync.Mutex
		}
		mtx sync.Mutex
	}
	// txnError wraps the final result of a transaction; a non-nil *txnError
	// means "done" even when the wrapped err is nil (see txnBase.isDone)
	txnError struct { // a wrapper whose presence means: "done"
		err error
	}
	txnBase struct { // generic base
		// timestamps set and read via started()
		phase struct {
			begin  time.Time
			commit time.Time
		}
		// when non-nil, txnBckBase.String() names the txn after this xaction
		xctn core.Xact
		// nil until done; stored by rsvp() or commitAfter()
		err        ratomic.Pointer[txnError]
		action     string
		callerName string
		callerID   string
		uid        string
		// cluster map and BMD versions captured by fillFromCtx()
		smapVer int64
		bmdVer  int64
		sync.RWMutex
	}
	// txnBckBase adds the bucket and the locks (nlps) that unlock() releases
	txnBckBase struct {
		bck  meta.Bck
		nlps []core.NLP
		txnBase
	}

	//
	// concrete transaction types
	//
	txnCreateBucket struct {
		txnBckBase
	}
	txnMakeNCopies struct {
		txnBckBase
		curCopies int64
		newCopies int64
	}
	txnSetBucketProps struct {
		// clone of the bucket's props taken at construction time
		bprops *cmn.Bprops
		// props passed to the constructor
		nprops *cmn.Bprops
		txnBckBase
	}
	txnRenameBucket struct {
		bckFrom *meta.Bck
		bckTo   *meta.Bck
		txnBckBase
	}
	txnTCB struct {
		xtcb *xs.XactTCB
		txnBckBase
	}
	txnTCObjs struct {
		xtco *xs.XactTCObjs
		msg  *cmn.TCObjsMsg
		txnBckBase
	}
	txnECEncode struct {
		txnBckBase
	}
	txnArchMultiObj struct {
		xarch *xs.XactArch
		msg   *cmn.ArchiveBckMsg
		txnBckBase
	}
	txnPromote struct {
		msg *apc.PromoteArgs
		// NOTE: not set by newTxnPromote
		xprm   *xs.XactDirPromote
		dirFQN string
		fqns   []string
		txnBckBase
		totalN int
		// NOTE: not set by newTxnPromote
		fshare bool
	}
)
   138  
   139  // interface guard
   140  var (
   141  	_ txn = (*txnBckBase)(nil)
   142  	_ txn = (*txnCreateBucket)(nil)
   143  	_ txn = (*txnMakeNCopies)(nil)
   144  	_ txn = (*txnSetBucketProps)(nil)
   145  	_ txn = (*txnRenameBucket)(nil)
   146  	_ txn = (*txnTCB)(nil)
   147  	_ txn = (*txnTCObjs)(nil)
   148  	_ txn = (*txnECEncode)(nil)
   149  	_ txn = (*txnPromote)(nil)
   150  )
   151  
   152  //////////////////
   153  // transactions //
   154  //////////////////
   155  
   156  func (txns *transactions) init(t *target) {
   157  	txns.t = t
   158  	txns.m = make(map[string]txn, 8)
   159  	txns.rendezvous.m = make(map[string]rndzvs, 8)
   160  	hk.Reg("txn"+hk.NameSuffix, txns.housekeep, gcTxnsInterval)
   161  }
   162  
   163  func (txns *transactions) begin(txn txn, nlps ...core.NLP) (err error) {
   164  	txns.mtx.Lock()
   165  	if x, ok := txns.m[txn.uuid()]; ok {
   166  		txns.mtx.Unlock()
   167  		for _, nlp := range nlps {
   168  			nlp.Unlock()
   169  		}
   170  		err = fmt.Errorf("%s: %s already exists (duplicate uuid?)", txns.t.si, x)
   171  		debug.AssertNoErr(err)
   172  		return
   173  	}
   174  	txn.started(apc.ActBegin, time.Now())
   175  	txn.set(nlps)
   176  	txns.m[txn.uuid()] = txn
   177  	txns.mtx.Unlock()
   178  
   179  	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
   180  		nlog.Infof("%s begin: %s", txns.t, txn)
   181  	}
   182  	return
   183  }
   184  
   185  func (txns *transactions) find(uuid, act string) (txn, error) {
   186  	txns.mtx.Lock()
   187  	txn, ok := txns.m[uuid]
   188  	if !ok {
   189  		// a) not found (benign in an unlikely event of failing to commit)
   190  		txns.mtx.Unlock()
   191  		return nil, cos.NewErrNotFound(txns.t, "txn "+uuid)
   192  	}
   193  
   194  	if act == "" {
   195  		// b) just find & return
   196  		txns.mtx.Unlock()
   197  		return txn, nil
   198  	}
   199  
   200  	// or c) cleanup
   201  	delete(txns.m, uuid)
   202  	txns.mtx.Unlock()
   203  
   204  	txns.rendezvous.mtx.Lock()
   205  	delete(txns.rendezvous.m, uuid)
   206  	txns.rendezvous.mtx.Unlock()
   207  
   208  	if act == apc.ActAbort {
   209  		txn.abort(errors.New("action: abort")) // NOTE: may call txn-specific abort, e.g. TxnAbort
   210  	} else {
   211  		debug.Assert(act == apc.ActCommit || act == ActCleanup, act)
   212  		txn.unlock()
   213  	}
   214  
   215  	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
   216  		nlog.Infof("%s %s: %s", txns.t, act, txn)
   217  	}
   218  	return txn, nil
   219  }
   220  
   221  func (txns *transactions) commitBefore(caller string, msg *aisMsg) error {
   222  	var (
   223  		rndzvs rndzvs
   224  		ok     bool
   225  	)
   226  	txns.rendezvous.mtx.Lock()
   227  	if rndzvs, ok = txns.rendezvous.m[msg.UUID]; !ok {
   228  		rndzvs.callerName, rndzvs.timestamp = caller, mono.NanoTime()
   229  		txns.rendezvous.m[msg.UUID] = rndzvs
   230  		txns.rendezvous.mtx.Unlock()
   231  		return nil
   232  	}
   233  	txns.rendezvous.mtx.Unlock()
   234  	return fmt.Errorf("rendezvous record %s:%d already exists", msg.UUID, rndzvs.timestamp)
   235  }
   236  
   237  func (txns *transactions) commitAfter(caller string, msg *aisMsg, err error, args ...any) (errDone error) {
   238  	txns.mtx.Lock()
   239  	txn, ok := txns.m[msg.UUID]
   240  	txns.mtx.Unlock()
   241  
   242  	var running bool
   243  	if ok {
   244  		// Ignore downgrade error.
   245  		if isErrDowngrade(err) {
   246  			err = nil
   247  			bmd := txns.t.owner.bmd.get()
   248  			nlog.Warningf("%s: commit with downgraded (current: %s)", txn, bmd)
   249  		}
   250  		if running, errDone = txn.commitAfter(caller, msg, err, args...); running {
   251  			nlog.Infoln(txn.String())
   252  		}
   253  	}
   254  	if !running {
   255  		txns.rendezvous.mtx.Lock()
   256  		rndzvs, ok := txns.rendezvous.m[msg.UUID]
   257  		if !ok { // can't happen
   258  			txns.rendezvous.mtx.Unlock()
   259  			errDone = cos.NewErrNotFound(txns.t, "rendezvous record "+msg.UUID)
   260  			return
   261  		}
   262  		rndzvs.err = &txnError{err: err}
   263  		txns.rendezvous.m[msg.UUID] = rndzvs
   264  		txns.rendezvous.mtx.Unlock()
   265  	}
   266  	return
   267  }
   268  
   269  // given txn, wait for its completion, handle timeout, and ultimately remove
   270  func (txns *transactions) wait(txn txn, timeoutNetw, timeoutHost time.Duration) (err error) {
   271  	// timestamp
   272  	txn.started(apc.ActCommit, time.Now())
   273  
   274  	// transfer err rendezvous => txn
   275  	txns.rendezvous.mtx.Lock()
   276  	rndzvs, ok := txns.rendezvous.m[txn.uuid()]
   277  	txns.rendezvous.mtx.Unlock()
   278  	if ok && rndzvs.err != nil {
   279  		txn.rsvp(rndzvs.err.err)
   280  	}
   281  
   282  	err = txns._wait(txn, timeoutNetw, timeoutHost)
   283  
   284  	// cleanup or abort, depending on the returned err
   285  	act := apc.ActCommit
   286  	if err != nil {
   287  		act = apc.ActAbort
   288  	}
   289  	txns.find(txn.uuid(), act)
   290  	return err
   291  }
   292  
   293  // poll for 'done'
   294  func (txns *transactions) _wait(txn txn, timeoutNetw, timeoutHost time.Duration) (err error) {
   295  	var (
   296  		sleep       = 100 * time.Millisecond
   297  		done, found bool
   298  	)
   299  	for total := sleep; ; {
   300  		if done, err = txn.isDone(); done {
   301  			return err
   302  		}
   303  		// aborted?
   304  		if _, err = txns.find(txn.uuid(), ""); err != nil {
   305  			return err
   306  		}
   307  
   308  		time.Sleep(sleep)
   309  		total += sleep
   310  		// bump once
   311  		if total == sleep<<4 {
   312  			sleep *= 4
   313  		}
   314  		// must be ready for rendezvous
   315  		if !found {
   316  			txns.rendezvous.mtx.Lock()
   317  			_, found = txns.rendezvous.m[txn.uuid()]
   318  			txns.rendezvous.mtx.Unlock()
   319  		}
   320  		// two timeouts
   321  		if found {
   322  			// config.Timeout.MaxHostBusy (see p.prepTxnClient)
   323  			if timeoutHost != 0 && total > timeoutHost {
   324  				err = errors.New("timed out waiting for txn to complete")
   325  				break
   326  			}
   327  		} else if timeoutNetw != 0 && total > timeoutNetw { // 2 * config.Timeout.MaxKeepalive (see p.prepTxnClient)
   328  			err = errors.New("timed out waiting for commit message")
   329  			break
   330  		}
   331  	}
   332  	return err
   333  }
   334  
   335  // GC orphaned transactions
   336  func (txns *transactions) housekeep() (d time.Duration) {
   337  	var (
   338  		errs    []error
   339  		orphans []txn
   340  		config  = cmn.GCO.Get()
   341  	)
   342  	d = gcTxnsInterval
   343  	txns.mtx.Lock()
   344  	l := len(txns.m)
   345  	if l == 0 {
   346  		txns.mtx.Unlock()
   347  		return
   348  	}
   349  	if l > max(gcTxnsNumKeep*4, 16) {
   350  		d = gcTxnsInterval / 10
   351  	}
   352  	now := time.Now()
   353  	for _, txn := range txns.m {
   354  		err, warn := checkTimeout(txn, now, config)
   355  		if err != nil {
   356  			errs = append(errs, err)
   357  			txn.abort(err)
   358  			delete(txns.m, txn.uuid())
   359  			orphans = append(orphans, txn)
   360  		} else if warn != nil {
   361  			errs = append(errs, warn)
   362  		}
   363  	}
   364  	txns.mtx.Unlock()
   365  
   366  	if len(orphans) > 0 || len(errs) > 0 {
   367  		go txns.cleanup(orphans, errs)
   368  	}
   369  	return
   370  }
   371  
   372  func (txns *transactions) cleanup(orphans []txn, errs []error) {
   373  	if len(orphans) > 0 {
   374  		txns.rendezvous.mtx.Lock()
   375  		for _, txn := range orphans {
   376  			delete(txns.rendezvous.m, txn.uuid())
   377  		}
   378  		txns.rendezvous.mtx.Unlock()
   379  	}
   380  	for _, e := range errs {
   381  		nlog.Errorln(e)
   382  	}
   383  }
   384  
   385  func checkTimeout(txn txn, now time.Time, config *cmn.Config) (err, warn error) {
   386  	elapsed := now.Sub(txn.started(apc.ActBegin))
   387  	if commitTimestamp := txn.started(apc.ActCommit); !commitTimestamp.IsZero() {
   388  		elapsed = now.Sub(commitTimestamp)
   389  		if elapsed > gcTxnsTimeotMult*config.Timeout.MaxHostBusy.D() {
   390  			err = fmt.Errorf("gc %s: [commit - done] timeout", txn)
   391  		} else if elapsed >= TxnTimeoutMult*config.Timeout.MaxHostBusy.D() {
   392  			err = fmt.Errorf("gc %s: commit is taking too long", txn)
   393  		}
   394  	} else {
   395  		if elapsed > TxnTimeoutMult*config.Timeout.MaxHostBusy.D() {
   396  			err = fmt.Errorf("gc %s: [begin - start-commit] timeout", txn)
   397  		} else if elapsed >= TxnTimeoutMult*cmn.Rom.MaxKeepalive() {
   398  			warn = fmt.Errorf("gc %s: commit message is taking too long", txn)
   399  		}
   400  	}
   401  	return
   402  }
   403  
   404  /////////////
   405  // txnBase //
   406  /////////////
   407  
   408  func (txn *txnBase) uuid() string { return txn.uid }
   409  
   410  func (txn *txnBase) started(phase string, tm ...time.Time) (ts time.Time) {
   411  	switch phase {
   412  	case apc.ActBegin:
   413  		if len(tm) > 0 {
   414  			txn.phase.begin = tm[0]
   415  		}
   416  		ts = txn.phase.begin
   417  	case apc.ActCommit:
   418  		if len(tm) > 0 {
   419  			txn.phase.commit = tm[0]
   420  		}
   421  		ts = txn.phase.commit
   422  	default:
   423  		debug.Assert(false)
   424  	}
   425  	return
   426  }
   427  
   428  func (txn *txnBase) isDone() (done bool, err error) {
   429  	if txnErr := txn.err.Load(); txnErr != nil {
   430  		err = txnErr.err
   431  		done = true
   432  	}
   433  	return
   434  }
   435  
   436  func (txn *txnBase) rsvp(err error) { txn.err.Store(&txnError{err: err}) }
   437  
   438  func (txn *txnBase) fillFromCtx(c *txnSrv) {
   439  	txn.uid = c.uuid
   440  	txn.action = c.msg.Action
   441  	txn.callerName = c.callerName
   442  	txn.callerID = c.callerID
   443  	txn.smapVer = c.t.owner.smap.get().version()
   444  	txn.bmdVer = c.t.owner.bmd.get().version()
   445  }
   446  
   447  ////////////////
   448  // txnBckBase //
   449  ////////////////
   450  
   451  func newTxnBckBase(bck *meta.Bck) (txn *txnBckBase) {
   452  	txn = &txnBckBase{}
   453  	txn.init(bck)
   454  	return
   455  }
   456  
   457  func (txn *txnBckBase) init(bck *meta.Bck) { txn.bck = *bck }
   458  
   459  func (txn *txnBckBase) set(nlps []core.NLP) {
   460  	txn.nlps = nlps
   461  }
   462  
   463  func (txn *txnBckBase) unlock() {
   464  	for _, p := range txn.nlps {
   465  		p.Unlock()
   466  	}
   467  	txn.nlps = txn.nlps[:0]
   468  }
   469  
   470  func (txn *txnBckBase) abort(err error) {
   471  	txn.unlock()
   472  	nlog.Infoln(txn.String(), "aborted:", err)
   473  }
   474  
   475  func (txn *txnBckBase) String() string {
   476  	var res, tm string
   477  	if done, err := txn.isDone(); done {
   478  		if err == nil {
   479  			res = " done"
   480  		} else {
   481  			res = fmt.Sprintf(" fail(%v)", err)
   482  		}
   483  	}
   484  	if txn.xctn != nil {
   485  		return fmt.Sprintf("txn-%s%s", txn.xctn, res)
   486  	}
   487  	if !txn.phase.commit.IsZero() {
   488  		tm = "-" + cos.FormatTime(txn.phase.commit, cos.StampMicro)
   489  	}
   490  	return fmt.Sprintf("txn-%s[%s]-%s%s%s]", txn.action, txn.uid, txn.bck.Bucket().String(), tm, res)
   491  }
   492  
   493  func (txn *txnBckBase) commitAfter(caller string, msg *aisMsg, err error, args ...any) (found bool, errDone error) {
   494  	if txn.callerName != caller || msg.UUID != txn.uuid() {
   495  		return
   496  	}
   497  	found = true
   498  	debug.Func(func() {
   499  		bmd, _ := args[0].(*bucketMD)
   500  		debug.Assert(bmd.version() >= txn.bmdVer)
   501  	})
   502  	if txnErr := txn.err.Swap(&txnError{err: err}); txnErr != nil {
   503  		errDone = fmt.Errorf("%s: already done with err=%v (%v)", txn, txnErr.err, err)
   504  		txn.err.Store(txnErr)
   505  	}
   506  	return
   507  }
   508  
   509  /////////////////////
   510  // txnCreateBucket //
   511  /////////////////////
   512  
   513  func newTxnCreateBucket(c *txnSrv) (txn *txnCreateBucket) {
   514  	txn = &txnCreateBucket{}
   515  	txn.init(c.bck)
   516  	txn.fillFromCtx(c)
   517  	return
   518  }
   519  
   520  ////////////////////
   521  // txnMakeNCopies //
   522  ////////////////////
   523  
   524  func newTxnMakeNCopies(c *txnSrv, curCopies, newCopies int64) (txn *txnMakeNCopies) {
   525  	txn = &txnMakeNCopies{curCopies: curCopies, newCopies: newCopies}
   526  	txn.init(c.bck)
   527  	txn.fillFromCtx(c)
   528  	return
   529  }
   530  
   531  func (txn *txnMakeNCopies) String() string {
   532  	s := txn.txnBckBase.String()
   533  	return fmt.Sprintf("%s-copies(%d=>%d)", s, txn.curCopies, txn.newCopies)
   534  }
   535  
   536  ///////////////////////
   537  // txnSetBucketProps //
   538  ///////////////////////
   539  
   540  func newTxnSetBucketProps(c *txnSrv, nprops *cmn.Bprops) (txn *txnSetBucketProps) {
   541  	cos.Assert(c.bck.Props != nil)
   542  	bprops := c.bck.Props.Clone()
   543  	txn = &txnSetBucketProps{bprops: bprops, nprops: nprops}
   544  	txn.init(c.bck)
   545  	txn.fillFromCtx(c)
   546  	return
   547  }
   548  
   549  /////////////////////
   550  // txnRenameBucket //
   551  /////////////////////
   552  
   553  func newTxnRenameBucket(c *txnSrv, bckFrom, bckTo *meta.Bck) (txn *txnRenameBucket) {
   554  	txn = &txnRenameBucket{bckFrom: bckFrom, bckTo: bckTo}
   555  	txn.init(bckFrom)
   556  	txn.fillFromCtx(c)
   557  	return
   558  }
   559  
   560  ////////////
   561  // txnTCB //
   562  ////////////
   563  
   564  func newTxnTCB(c *txnSrv, xtcb *xs.XactTCB) (txn *txnTCB) {
   565  	txn = &txnTCB{xtcb: xtcb}
   566  	txn.init(xtcb.Args().BckFrom)
   567  	txn.fillFromCtx(c)
   568  	return
   569  }
   570  
   571  func (txn *txnTCB) abort(err error) {
   572  	txn.unlock()
   573  	txn.xtcb.TxnAbort(err)
   574  }
   575  
   576  func (txn *txnTCB) String() string {
   577  	txn.xctn = txn.xtcb
   578  	return txn.txnBckBase.String()
   579  }
   580  
   581  ///////////////
   582  // txnTCObjs //
   583  ///////////////
   584  
   585  func newTxnTCObjs(c *txnSrv, bckFrom *meta.Bck, xtco *xs.XactTCObjs, msg *cmn.TCObjsMsg) (txn *txnTCObjs) {
   586  	txn = &txnTCObjs{xtco: xtco, msg: msg}
   587  	txn.init(bckFrom)
   588  	txn.fillFromCtx(c)
   589  	return
   590  }
   591  
   592  func (txn *txnTCObjs) abort(err error) {
   593  	txn.unlock()
   594  	txn.xtco.TxnAbort(err)
   595  }
   596  
   597  func (txn *txnTCObjs) String() string {
   598  	txn.xctn = txn.xtco
   599  	return txn.txnBckBase.String()
   600  }
   601  
   602  /////////////////
   603  // txnECEncode //
   604  /////////////////
   605  
   606  func newTxnECEncode(c *txnSrv, bck *meta.Bck) (txn *txnECEncode) {
   607  	txn = &txnECEncode{}
   608  	txn.init(bck)
   609  	txn.fillFromCtx(c)
   610  	return
   611  }
   612  
/////////////////////
// txnArchMultiObj //
/////////////////////
   616  
   617  func newTxnArchMultiObj(c *txnSrv, bckFrom *meta.Bck, xarch *xs.XactArch, msg *cmn.ArchiveBckMsg) (txn *txnArchMultiObj) {
   618  	txn = &txnArchMultiObj{xarch: xarch, msg: msg}
   619  	txn.init(bckFrom)
   620  	txn.fillFromCtx(c)
   621  	return
   622  }
   623  
   624  func (txn *txnArchMultiObj) abort(err error) {
   625  	txn.unlock()
   626  	txn.xarch.TxnAbort(err)
   627  }
   628  
   629  func (txn *txnArchMultiObj) String() string {
   630  	txn.xctn = txn.xarch
   631  	return txn.txnBckBase.String()
   632  }
   633  
   634  ////////////////
   635  // txnPromote //
   636  ////////////////
   637  
   638  func newTxnPromote(c *txnSrv, msg *apc.PromoteArgs, fqns []string, dirFQN string, totalN int) (txn *txnPromote) {
   639  	txn = &txnPromote{msg: msg, fqns: fqns, dirFQN: dirFQN, totalN: totalN}
   640  	txn.init(c.bck)
   641  	txn.fillFromCtx(c)
   642  	return
   643  }
   644  
   645  func (txn *txnPromote) String() (s string) {
   646  	txn.xctn = txn.xprm
   647  	return fmt.Sprintf("%s-src(%s)-N(%d)-fshare(%t)", txn.txnBckBase.String(), txn.dirFQN, txn.totalN, txn.fshare)
   648  }