github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/xact/base.go (about)

     1  // Package xact provides core functionality for the AIStore eXtended Actions (xactions).
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package xact
     6  
     7  import (
     8  	"fmt"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	ratomic "sync/atomic"
    13  	"time"
    14  
    15  	"github.com/NVIDIA/aistore/api/apc"
    16  	"github.com/NVIDIA/aistore/cmn"
    17  	"github.com/NVIDIA/aistore/cmn/atomic"
    18  	"github.com/NVIDIA/aistore/cmn/cos"
    19  	"github.com/NVIDIA/aistore/cmn/debug"
    20  	"github.com/NVIDIA/aistore/cmn/nlog"
    21  	"github.com/NVIDIA/aistore/core"
    22  	"github.com/NVIDIA/aistore/core/meta"
    23  	"github.com/NVIDIA/aistore/fs"
    24  	"github.com/NVIDIA/aistore/nl"
    25  )
    26  
    27  type (
    28  	Base struct {
    29  		notif  *NotifXact
    30  		bck    meta.Bck
    31  		id     string
    32  		kind   string
    33  		_nam   string
    34  		sutime atomic.Int64
    35  		eutime atomic.Int64
    36  		abort  struct {
    37  			ch   chan error
    38  			err  ratomic.Pointer[error]
    39  			done atomic.Bool
    40  		}
    41  		stats struct {
    42  			objs     atomic.Int64 // locally processed
    43  			bytes    atomic.Int64
    44  			outobjs  atomic.Int64 // transmit
    45  			outbytes atomic.Int64
    46  			inobjs   atomic.Int64 // receive
    47  			inbytes  atomic.Int64
    48  		}
    49  		err cos.Errs
    50  	}
    51  	Marked struct {
    52  		Xact        core.Xact
    53  		Interrupted bool // (rebalance | resilver) interrupted
    54  		Restarted   bool // node restarted
    55  	}
    56  )
    57  
// IncFinished is a hook called (unconditionally) by onFinished each time an xaction finishes.
// It is declared here but not assigned in this file: it must be set elsewhere before
// any xaction completes, otherwise the call would dereference a nil func.
var IncFinished func()
    59  
    60  // common helper to go-run and wait until it actually starts running
    61  func GoRunW(xctn core.Xact) {
    62  	wg := &sync.WaitGroup{}
    63  	wg.Add(1)
    64  	go xctn.Run(wg)
    65  	wg.Wait()
    66  }
    67  
    68  func IsValidUUID(id string) bool { return cos.IsValidUUID(id) || IsValidRebID(id) }
    69  
    70  //////////////
    71  // Base - partially implements `core.Xact` interface
    72  //////////////
    73  
    74  func (xctn *Base) InitBase(id, kind string, bck *meta.Bck) {
    75  	debug.Assert(kind == apc.ActETLInline || cos.IsValidUUID(id) || IsValidRebID(id), id)
    76  	debug.Assert(IsValidKind(kind), kind)
    77  	xctn.id, xctn.kind = id, kind
    78  	xctn.abort.ch = make(chan error, 1)
    79  	if bck != nil {
    80  		xctn.bck = *bck
    81  	}
    82  	xctn.setStartTime(time.Now())
    83  
    84  	// name never changes
    85  	xctn._nam = "x-" + xctn.Kind() + LeftID + xctn.ID() + RightID
    86  	if !xctn.bck.IsEmpty() {
    87  		xctn._nam += "-" + xctn.bck.Cname("")
    88  	}
    89  }
    90  
    91  func (xctn *Base) ID() string   { return xctn.id }
    92  func (xctn *Base) Kind() string { return xctn.kind }
    93  
    94  func (xctn *Base) Bck() *meta.Bck { return &xctn.bck }
    95  
    96  func (xctn *Base) Finished() bool { return xctn.eutime.Load() != 0 }
    97  
    98  func (xctn *Base) Running() (yes bool) {
    99  	yes = xctn.sutime.Load() != 0 && !xctn.Finished() && !xctn.IsAborted()
   100  	debug.Assert(!yes || xctn.ID() != "", xctn.String())
   101  	return
   102  }
   103  
   104  func (xctn *Base) IsIdle() bool { return !xctn.Running() }
   105  
   106  func (*Base) FromTo() (*meta.Bck, *meta.Bck) { return nil, nil }
   107  
   108  //
   109  // aborting
   110  //
   111  
   112  func (xctn *Base) ChanAbort() <-chan error { return xctn.abort.ch }
   113  
   114  func (xctn *Base) IsAborted() bool { return xctn.abort.done.Load() }
   115  
   116  func (xctn *Base) AbortErr() error {
   117  	if !xctn.IsAborted() {
   118  		return nil
   119  	}
   120  	// (is aborted)
   121  	// normally, is expected to return `abort.err` without any sleep
   122  	// but may also poll up to 4 times for 1s total
   123  	const wait = time.Second
   124  	sleep := cos.ProbingFrequency(wait)
   125  	for elapsed := time.Duration(0); elapsed < wait; elapsed += sleep {
   126  		perr := xctn.abort.err.Load()
   127  		if perr != nil {
   128  			return *perr
   129  		}
   130  		time.Sleep(sleep)
   131  	}
   132  	return cmn.NewErrAborted(xctn.Name(), "base.abort-err.timeout", nil)
   133  }
   134  
   135  func (xctn *Base) AbortedAfter(d time.Duration) (err error) {
   136  	sleep := cos.ProbingFrequency(d)
   137  	for elapsed := time.Duration(0); elapsed < d; elapsed += sleep {
   138  		if err = xctn.AbortErr(); err != nil {
   139  			break
   140  		}
   141  		time.Sleep(sleep)
   142  	}
   143  	return
   144  }
   145  
   146  func (xctn *Base) Abort(err error) bool {
   147  	if xctn.Finished() || !xctn.abort.done.CAS(false, true) {
   148  		return false
   149  	}
   150  
   151  	if err == nil {
   152  		err = cmn.ErrXactUserAbort // NOTE: only user can cause no-errors abort
   153  	} else if errAborted := cmn.AsErrAborted(err); errAborted != nil {
   154  		if errCause := errAborted.Unwrap(); errCause != nil {
   155  			err = errCause
   156  		}
   157  	}
   158  	perr := xctn.abort.err.Swap(&err)
   159  	debug.Assert(perr == nil, xctn.String())
   160  	debug.Assert(len(xctn.abort.ch) == 0, xctn.String()) // CAS above
   161  
   162  	xctn.abort.ch <- err
   163  	close(xctn.abort.ch)
   164  
   165  	if xctn.Kind() != apc.ActList {
   166  		nlog.InfoDepth(1, xctn.Name(), err)
   167  	}
   168  	return true
   169  }
   170  
   171  //
   172  // multi-error
   173  //
   174  
   175  func (xctn *Base) AddErr(err error, logExtra ...int) {
   176  	if xctn.IsAborted() { // no more errors once aborted
   177  		return
   178  	}
   179  	debug.Assert(err != nil)
   180  	fs.CleanPathErr(err)
   181  	xctn.err.Add(err)
   182  	// just add
   183  	if len(logExtra) == 0 {
   184  		return
   185  	}
   186  	// log error
   187  	level := logExtra[0]
   188  	if level == 0 {
   189  		nlog.ErrorDepth(1, err)
   190  		return
   191  	}
   192  	// finally, FastV
   193  	module := logExtra[1]
   194  	if cmn.Rom.FastV(level, module) {
   195  		nlog.InfoDepth(1, "Warning:", err)
   196  	}
   197  }
   198  
   199  func (xctn *Base) Err() error {
   200  	if xctn.ErrCnt() == 0 {
   201  		return nil
   202  	}
   203  	return &xctn.err
   204  }
   205  
   206  func (xctn *Base) JoinErr() (int, error) { return xctn.err.JoinErr() }
   207  func (xctn *Base) ErrCnt() int           { return xctn.err.Cnt() }
   208  
   209  // count all the way to duration; reset and adjust every time activity is detected
   210  func (xctn *Base) Quiesce(d time.Duration, cb core.QuiCB) core.QuiRes {
   211  	var (
   212  		idle, total time.Duration
   213  		sleep       = cos.ProbingFrequency(d)
   214  		dur         = d
   215  	)
   216  	if xctn.IsAborted() {
   217  		return core.QuiAborted
   218  	}
   219  	for idle < dur {
   220  		time.Sleep(sleep)
   221  		if xctn.IsAborted() {
   222  			return core.QuiAborted
   223  		}
   224  		total += sleep
   225  		switch res := cb(total); res {
   226  		case core.QuiInactiveCB: // NOTE: used by callbacks, converts to one of the returned codes
   227  			idle += sleep
   228  		case core.QuiActive:
   229  			idle = 0                  // reset
   230  			dur = min(dur+sleep, 2*d) // bump up to 2x initial
   231  		case core.QuiActiveRet:
   232  			return core.QuiActiveRet
   233  		case core.QuiDone:
   234  			return core.QuiDone
   235  		case core.QuiTimeout:
   236  			return core.QuiTimeout
   237  		}
   238  	}
   239  	return core.Quiescent
   240  }
   241  
   242  func (xctn *Base) Cname() string { return Cname(xctn.Kind(), xctn.ID()) }
   243  
   244  func (xctn *Base) Name() (s string) { return xctn._nam }
   245  
   246  func (xctn *Base) _sb() (sb strings.Builder) {
   247  	sb.WriteString(xctn._nam)
   248  	sb.WriteByte('-')
   249  	sb.WriteString(cos.FormatTime(xctn.StartTime(), cos.StampMicro))
   250  
   251  	if !xctn.Finished() { // ok to (rarely) miss _aborted_ state as this is purely informational
   252  		return sb
   253  	}
   254  	etime := cos.FormatTime(xctn.EndTime(), cos.StampMicro)
   255  	if xctn.IsAborted() {
   256  		sb.WriteString(fmt.Sprintf("-[abrt: %v]", xctn.AbortErr()))
   257  	}
   258  	sb.WriteByte('-')
   259  	sb.WriteString(etime)
   260  	return sb
   261  }
   262  
   263  func (xctn *Base) String() string {
   264  	sb := xctn._sb()
   265  	return sb.String()
   266  }
   267  
   268  func (xctn *Base) StartTime() time.Time {
   269  	u := xctn.sutime.Load()
   270  	if u != 0 {
   271  		return time.Unix(0, u)
   272  	}
   273  	return time.Time{}
   274  }
   275  
   276  func (xctn *Base) setStartTime(s time.Time) { xctn.sutime.Store(s.UnixNano()) }
   277  
   278  func (xctn *Base) EndTime() time.Time {
   279  	u := xctn.eutime.Load()
   280  	if u != 0 {
   281  		return time.Unix(0, u)
   282  	}
   283  	return time.Time{}
   284  }
   285  
   286  // upon completion, all xactions optionally notify listener(s) and refresh local capacity stats
   287  func (xctn *Base) onFinished(err error, aborted bool) {
   288  	// notifications
   289  	if xctn.notif != nil {
   290  		nl.OnFinished(xctn.notif, err, aborted)
   291  	}
   292  	xactRecord := Table[xctn.kind]
   293  	if xactRecord.RefreshCap {
   294  		// currently, ignoring returned err-cap and not calling t.OOS()
   295  		// both (conditions) handled by periodic stats
   296  		fs.CapRefresh(nil /*config*/, nil /*tcdf*/)
   297  	}
   298  
   299  	IncFinished() // in re: HK cleanup long-time finished
   300  }
   301  
   302  func (xctn *Base) AddNotif(n core.Notif) {
   303  	xctn.notif = n.(*NotifXact)
   304  	debug.Assert(xctn.notif.Xact != nil && xctn.notif.F != nil)     // always fin-notif and points to self
   305  	debug.Assert(!n.Upon(core.UponProgress) || xctn.notif.P != nil) // progress notification is optional
   306  }
   307  
   308  // atomically set end-time
   309  func (xctn *Base) Finish() {
   310  	var (
   311  		err     error
   312  		info    string
   313  		aborted bool
   314  	)
   315  	if !xctn.eutime.CAS(0, 1) {
   316  		return
   317  	}
   318  	xctn.eutime.Store(time.Now().UnixNano())
   319  	if aborted = xctn.IsAborted(); aborted {
   320  		if perr := xctn.abort.err.Load(); perr != nil {
   321  			err = *perr
   322  		}
   323  	}
   324  	if xctn.ErrCnt() > 0 {
   325  		if err == nil {
   326  			debug.Assert(!aborted)
   327  			err = xctn.Err()
   328  		} else {
   329  			// abort takes precedence
   330  			info = "(" + xctn.Err().Error() + ")"
   331  		}
   332  	}
   333  	xctn.onFinished(err, aborted)
   334  	// log
   335  	switch {
   336  	case xctn.Kind() == apc.ActList:
   337  	case err == nil:
   338  		nlog.Infoln(xctn.String(), "finished")
   339  	case aborted:
   340  		nlog.Warningln(xctn.String(), "aborted:", err.Error(), info)
   341  	default:
   342  		nlog.Infoln("Warning:", xctn.String(), "finished w/err:", err.Error())
   343  	}
   344  }
   345  
   346  // base stats: locally processed
   347  func (xctn *Base) Objs() int64  { return xctn.stats.objs.Load() }
   348  func (xctn *Base) Bytes() int64 { return xctn.stats.bytes.Load() }
   349  
   350  func (xctn *Base) ObjsAdd(cnt int, size int64) {
   351  	xctn.stats.objs.Add(int64(cnt))
   352  	xctn.stats.bytes.Add(size)
   353  }
   354  
   355  // oft. used
   356  func (xctn *Base) LomAdd(lom *core.LOM) { xctn.ObjsAdd(1, lom.SizeBytes(true)) }
   357  
   358  // base stats: transmit
   359  func (xctn *Base) OutObjs() int64  { return xctn.stats.outobjs.Load() }
   360  func (xctn *Base) OutBytes() int64 { return xctn.stats.outbytes.Load() }
   361  
   362  func (xctn *Base) OutObjsAdd(cnt int, size int64) {
   363  	xctn.stats.outobjs.Add(int64(cnt))
   364  	if size > 0 { // not unsized
   365  		xctn.stats.outbytes.Add(size)
   366  	}
   367  }
   368  
   369  // base stats: receive
   370  func (xctn *Base) InObjs() int64  { return xctn.stats.inobjs.Load() }
   371  func (xctn *Base) InBytes() int64 { return xctn.stats.inbytes.Load() }
   372  
   373  func (xctn *Base) InObjsAdd(cnt int, size int64) {
   374  	debug.Assert(size >= 0, xctn.String()) // "unsized" is caller's responsibility
   375  	xctn.stats.inobjs.Add(int64(cnt))
   376  	xctn.stats.inbytes.Add(size)
   377  }
   378  
   379  // provided for external use to fill-in xaction-specific `SnapExt` part
   380  func (xctn *Base) ToSnap(snap *core.Snap) {
   381  	snap.ID = xctn.ID()
   382  	snap.Kind = xctn.Kind()
   383  	snap.StartTime = xctn.StartTime()
   384  	snap.EndTime = xctn.EndTime()
   385  	if err := xctn.AbortErr(); err != nil {
   386  		snap.AbortErr = err.Error()
   387  		snap.AbortedX = true
   388  	}
   389  	snap.Err = xctn.err.Error() // TODO: a (verbose) option to respond with xctn.err.JoinErr() :NOTE
   390  	if b := xctn.Bck(); b != nil {
   391  		snap.Bck = b.Clone()
   392  	}
   393  
   394  	// counters
   395  	xctn.ToStats(&snap.Stats)
   396  }
   397  
   398  func (xctn *Base) ToStats(stats *core.Stats) {
   399  	stats.Objs = xctn.Objs()         // locally processed
   400  	stats.Bytes = xctn.Bytes()       //
   401  	stats.OutObjs = xctn.OutObjs()   // transmit
   402  	stats.OutBytes = xctn.OutBytes() //
   403  	stats.InObjs = xctn.InObjs()     // receive
   404  	stats.InBytes = xctn.InBytes()
   405  }
   406  
   407  // RebID helpers
   408  
   409  func RebID2S(id int64) string          { return fmt.Sprintf("g%d", id) }
   410  func S2RebID(id string) (int64, error) { return strconv.ParseInt(id[1:], 10, 64) }
   411  
   412  func IsValidRebID(id string) (valid bool) {
   413  	if len(id) > 1 {
   414  		_, err := S2RebID(id)
   415  		valid = err == nil
   416  	}
   417  	return
   418  }
   419  
   420  func CompareRebIDs(someID, fltID string) int {
   421  	ai, err := S2RebID(someID)
   422  	if err != nil {
   423  		return -1 // m.b. less than
   424  	}
   425  	bi, err := S2RebID(fltID)
   426  	debug.Assert(err == nil, fltID)
   427  	if ai < bi {
   428  		return -1
   429  	}
   430  	if ai > bi {
   431  		return 1
   432  	}
   433  	return 0
   434  }