github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/xact/base.go (about) 1 // Package xact provides core functionality for the AIStore eXtended Actions (xactions). 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package xact 6 7 import ( 8 "fmt" 9 "strconv" 10 "strings" 11 "sync" 12 ratomic "sync/atomic" 13 "time" 14 15 "github.com/NVIDIA/aistore/api/apc" 16 "github.com/NVIDIA/aistore/cmn" 17 "github.com/NVIDIA/aistore/cmn/atomic" 18 "github.com/NVIDIA/aistore/cmn/cos" 19 "github.com/NVIDIA/aistore/cmn/debug" 20 "github.com/NVIDIA/aistore/cmn/nlog" 21 "github.com/NVIDIA/aistore/core" 22 "github.com/NVIDIA/aistore/core/meta" 23 "github.com/NVIDIA/aistore/fs" 24 "github.com/NVIDIA/aistore/nl" 25 ) 26 27 type ( 28 Base struct { 29 notif *NotifXact 30 bck meta.Bck 31 id string 32 kind string 33 _nam string 34 sutime atomic.Int64 35 eutime atomic.Int64 36 abort struct { 37 ch chan error 38 err ratomic.Pointer[error] 39 done atomic.Bool 40 } 41 stats struct { 42 objs atomic.Int64 // locally processed 43 bytes atomic.Int64 44 outobjs atomic.Int64 // transmit 45 outbytes atomic.Int64 46 inobjs atomic.Int64 // receive 47 inbytes atomic.Int64 48 } 49 err cos.Errs 50 } 51 Marked struct { 52 Xact core.Xact 53 Interrupted bool // (rebalance | resilver) interrupted 54 Restarted bool // node restarted 55 } 56 ) 57 58 var IncFinished func() 59 60 // common helper to go-run and wait until it actually starts running 61 func GoRunW(xctn core.Xact) { 62 wg := &sync.WaitGroup{} 63 wg.Add(1) 64 go xctn.Run(wg) 65 wg.Wait() 66 } 67 68 func IsValidUUID(id string) bool { return cos.IsValidUUID(id) || IsValidRebID(id) } 69 70 ////////////// 71 // Base - partially implements `core.Xact` interface 72 ////////////// 73 74 func (xctn *Base) InitBase(id, kind string, bck *meta.Bck) { 75 debug.Assert(kind == apc.ActETLInline || cos.IsValidUUID(id) || IsValidRebID(id), id) 76 debug.Assert(IsValidKind(kind), kind) 77 xctn.id, xctn.kind = id, kind 78 xctn.abort.ch = make(chan error, 1) 79 if bck != nil { 80 xctn.bck = *bck 81 } 82 xctn.setStartTime(time.Now()) 83 84 // name never changes 85 xctn._nam = "x-" + xctn.Kind() + LeftID + xctn.ID() + RightID 86 if !xctn.bck.IsEmpty() { 87 xctn._nam += "-" + xctn.bck.Cname("") 88 } 89 } 90 91 func (xctn *Base) ID() string { return xctn.id } 92 func (xctn *Base) Kind() string { return xctn.kind } 93 94 func (xctn *Base) Bck() *meta.Bck { return &xctn.bck } 95 96 func (xctn *Base) Finished() bool { return xctn.eutime.Load() != 0 } 97 98 func (xctn *Base) Running() (yes bool) { 99 yes = xctn.sutime.Load() != 0 && !xctn.Finished() && !xctn.IsAborted() 100 debug.Assert(!yes || xctn.ID() != "", xctn.String()) 101 return 102 } 103 104 func (xctn *Base) IsIdle() bool { return !xctn.Running() } 105 106 func (*Base) FromTo() (*meta.Bck, *meta.Bck) { return nil, nil } 107 108 // 109 // aborting 110 // 111 112 func (xctn *Base) ChanAbort() <-chan error { return xctn.abort.ch } 113 114 func (xctn *Base) IsAborted() bool { return xctn.abort.done.Load() } 115 116 func (xctn *Base) AbortErr() error { 117 if !xctn.IsAborted() { 118 return nil 119 } 120 // (is aborted) 121 // normally, is expected to return `abort.err` without any sleep 122 // but may also poll up to 4 times for 1s total 123 const wait = time.Second 124 sleep := cos.ProbingFrequency(wait) 125 for elapsed := time.Duration(0); elapsed < wait; elapsed += sleep { 126 perr := xctn.abort.err.Load() 127 if perr != nil { 128 return *perr 129 } 130 time.Sleep(sleep) 131 } 132 return cmn.NewErrAborted(xctn.Name(), "base.abort-err.timeout", nil) 133 } 134 135 func (xctn *Base) AbortedAfter(d time.Duration) (err error) { 136 sleep := cos.ProbingFrequency(d) 137 for elapsed := time.Duration(0); elapsed < d; elapsed += sleep { 138 if err = xctn.AbortErr(); err != nil { 139 break 140 } 141 time.Sleep(sleep) 142 } 143 return 144 } 145 146 func (xctn *Base) Abort(err error) bool { 147 if xctn.Finished() || !xctn.abort.done.CAS(false, true) { 148 return false 149 } 150 151 if err == nil { 152 err = cmn.ErrXactUserAbort // NOTE: only user can cause no-errors abort 153 } else if errAborted := cmn.AsErrAborted(err); errAborted != nil { 154 if errCause := errAborted.Unwrap(); errCause != nil { 155 err = errCause 156 } 157 } 158 perr := xctn.abort.err.Swap(&err) 159 debug.Assert(perr == nil, xctn.String()) 160 debug.Assert(len(xctn.abort.ch) == 0, xctn.String()) // CAS above 161 162 xctn.abort.ch <- err 163 close(xctn.abort.ch) 164 165 if xctn.Kind() != apc.ActList { 166 nlog.InfoDepth(1, xctn.Name(), err) 167 } 168 return true 169 } 170 171 // 172 // multi-error 173 // 174 175 func (xctn *Base) AddErr(err error, logExtra ...int) { 176 if xctn.IsAborted() { // no more errors once aborted 177 return 178 } 179 debug.Assert(err != nil) 180 fs.CleanPathErr(err) 181 xctn.err.Add(err) 182 // just add 183 if len(logExtra) == 0 { 184 return 185 } 186 // log error 187 level := logExtra[0] 188 if level == 0 { 189 nlog.ErrorDepth(1, err) 190 return 191 } 192 // finally, FastV 193 module := logExtra[1] 194 if cmn.Rom.FastV(level, module) { 195 nlog.InfoDepth(1, "Warning:", err) 196 } 197 } 198 199 func (xctn *Base) Err() error { 200 if xctn.ErrCnt() == 0 { 201 return nil 202 } 203 return &xctn.err 204 } 205 206 func (xctn *Base) JoinErr() (int, error) { return xctn.err.JoinErr() } 207 func (xctn *Base) ErrCnt() int { return xctn.err.Cnt() } 208 209 // count all the way to duration; reset and adjust every time activity is detected 210 func (xctn *Base) Quiesce(d time.Duration, cb core.QuiCB) core.QuiRes { 211 var ( 212 idle, total time.Duration 213 sleep = cos.ProbingFrequency(d) 214 dur = d 215 ) 216 if xctn.IsAborted() { 217 return core.QuiAborted 218 } 219 for idle < dur { 220 time.Sleep(sleep) 221 if xctn.IsAborted() { 222 return core.QuiAborted 223 } 224 total += sleep 225 switch res := cb(total); res { 226 case core.QuiInactiveCB: // NOTE: used by callbacks, converts to one of the returned codes 227 idle += sleep 228 case core.QuiActive: 229 idle = 0 // reset 230 dur = min(dur+sleep, 2*d) // bump up to 2x initial 231 case core.QuiActiveRet: 232 return core.QuiActiveRet 233 case core.QuiDone: 234 return core.QuiDone 235 case core.QuiTimeout: 236 return core.QuiTimeout 237 } 238 } 239 return core.Quiescent 240 } 241 242 func (xctn *Base) Cname() string { return Cname(xctn.Kind(), xctn.ID()) } 243 244 func (xctn *Base) Name() (s string) { return xctn._nam } 245 246 func (xctn *Base) _sb() (sb strings.Builder) { 247 sb.WriteString(xctn._nam) 248 sb.WriteByte('-') 249 sb.WriteString(cos.FormatTime(xctn.StartTime(), cos.StampMicro)) 250 251 if !xctn.Finished() { // ok to (rarely) miss _aborted_ state as this is purely informational 252 return sb 253 } 254 etime := cos.FormatTime(xctn.EndTime(), cos.StampMicro) 255 if xctn.IsAborted() { 256 sb.WriteString(fmt.Sprintf("-[abrt: %v]", xctn.AbortErr())) 257 } 258 sb.WriteByte('-') 259 sb.WriteString(etime) 260 return sb 261 } 262 263 func (xctn *Base) String() string { 264 sb := xctn._sb() 265 return sb.String() 266 } 267 268 func (xctn *Base) StartTime() time.Time { 269 u := xctn.sutime.Load() 270 if u != 0 { 271 return time.Unix(0, u) 272 } 273 return time.Time{} 274 } 275 276 func (xctn *Base) setStartTime(s time.Time) { xctn.sutime.Store(s.UnixNano()) } 277 278 func (xctn *Base) EndTime() time.Time { 279 u := xctn.eutime.Load() 280 if u != 0 { 281 return time.Unix(0, u) 282 } 283 return time.Time{} 284 } 285 286 // upon completion, all xactions optionally notify listener(s) and refresh local capacity stats 287 func (xctn *Base) onFinished(err error, aborted bool) { 288 // notifications 289 if xctn.notif != nil { 290 nl.OnFinished(xctn.notif, err, aborted) 291 } 292 xactRecord := Table[xctn.kind] 293 if xactRecord.RefreshCap { 294 // currently, ignoring returned err-cap and not calling t.OOS() 295 // both (conditions) handled by periodic stats 296 fs.CapRefresh(nil /*config*/, nil /*tcdf*/) 297 } 298 299 IncFinished() // in re: HK cleanup long-time finished 300 } 301 302 func (xctn *Base) AddNotif(n core.Notif) { 303 xctn.notif = n.(*NotifXact) 304 debug.Assert(xctn.notif.Xact != nil && xctn.notif.F != nil) // always fin-notif and points to self 305 debug.Assert(!n.Upon(core.UponProgress) || xctn.notif.P != nil) // progress notification is optional 306 } 307 308 // atomically set end-time 309 func (xctn *Base) Finish() { 310 var ( 311 err error 312 info string 313 aborted bool 314 ) 315 if !xctn.eutime.CAS(0, 1) { 316 return 317 } 318 xctn.eutime.Store(time.Now().UnixNano()) 319 if aborted = xctn.IsAborted(); aborted { 320 if perr := xctn.abort.err.Load(); perr != nil { 321 err = *perr 322 } 323 } 324 if xctn.ErrCnt() > 0 { 325 if err == nil { 326 debug.Assert(!aborted) 327 err = xctn.Err() 328 } else { 329 // abort takes precedence 330 info = "(" + xctn.Err().Error() + ")" 331 } 332 } 333 xctn.onFinished(err, aborted) 334 // log 335 switch { 336 case xctn.Kind() == apc.ActList: 337 case err == nil: 338 nlog.Infoln(xctn.String(), "finished") 339 case aborted: 340 nlog.Warningln(xctn.String(), "aborted:", err.Error(), info) 341 default: 342 nlog.Infoln("Warning:", xctn.String(), "finished w/err:", err.Error()) 343 } 344 } 345 346 // base stats: locally processed 347 func (xctn *Base) Objs() int64 { return xctn.stats.objs.Load() } 348 func (xctn *Base) Bytes() int64 { return xctn.stats.bytes.Load() } 349 350 func (xctn *Base) ObjsAdd(cnt int, size int64) { 351 xctn.stats.objs.Add(int64(cnt)) 352 xctn.stats.bytes.Add(size) 353 } 354 355 // oft. used 356 func (xctn *Base) LomAdd(lom *core.LOM) { xctn.ObjsAdd(1, lom.SizeBytes(true)) } 357 358 // base stats: transmit 359 func (xctn *Base) OutObjs() int64 { return xctn.stats.outobjs.Load() } 360 func (xctn *Base) OutBytes() int64 { return xctn.stats.outbytes.Load() } 361 362 func (xctn *Base) OutObjsAdd(cnt int, size int64) { 363 xctn.stats.outobjs.Add(int64(cnt)) 364 if size > 0 { // not unsized 365 xctn.stats.outbytes.Add(size) 366 } 367 } 368 369 // base stats: receive 370 func (xctn *Base) InObjs() int64 { return xctn.stats.inobjs.Load() } 371 func (xctn *Base) InBytes() int64 { return xctn.stats.inbytes.Load() } 372 373 func (xctn *Base) InObjsAdd(cnt int, size int64) { 374 debug.Assert(size >= 0, xctn.String()) // "unsized" is caller's responsibility 375 xctn.stats.inobjs.Add(int64(cnt)) 376 xctn.stats.inbytes.Add(size) 377 } 378 379 // provided for external use to fill-in xaction-specific `SnapExt` part 380 func (xctn *Base) ToSnap(snap *core.Snap) { 381 snap.ID = xctn.ID() 382 snap.Kind = xctn.Kind() 383 snap.StartTime = xctn.StartTime() 384 snap.EndTime = xctn.EndTime() 385 if err := xctn.AbortErr(); err != nil { 386 snap.AbortErr = err.Error() 387 snap.AbortedX = true 388 } 389 snap.Err = xctn.err.Error() // TODO: a (verbose) option to respond with xctn.err.JoinErr() :NOTE 390 if b := xctn.Bck(); b != nil { 391 snap.Bck = b.Clone() 392 } 393 394 // counters 395 xctn.ToStats(&snap.Stats) 396 } 397 398 func (xctn *Base) ToStats(stats *core.Stats) { 399 stats.Objs = xctn.Objs() // locally processed 400 stats.Bytes = xctn.Bytes() // 401 stats.OutObjs = xctn.OutObjs() // transmit 402 stats.OutBytes = xctn.OutBytes() // 403 stats.InObjs = xctn.InObjs() // receive 404 stats.InBytes = xctn.InBytes() 405 } 406 407 // RebID helpers 408 409 func RebID2S(id int64) string { return fmt.Sprintf("g%d", id) } 410 func S2RebID(id string) (int64, error) { return strconv.ParseInt(id[1:], 10, 64) } 411 412 func IsValidRebID(id string) (valid bool) { 413 if len(id) > 1 { 414 _, err := S2RebID(id) 415 valid = err == nil 416 } 417 return 418 } 419 420 func CompareRebIDs(someID, fltID string) int { 421 ai, err := S2RebID(someID) 422 if err != nil { 423 return -1 // m.b. less than 424 } 425 bi, err := S2RebID(fltID) 426 debug.Assert(err == nil, fltID) 427 if ai < bi { 428 return -1 429 } 430 if ai > bi { 431 return 1 432 } 433 return 0 434 }