github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/txn.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "errors" 9 "fmt" 10 "sync" 11 ratomic "sync/atomic" 12 "time" 13 14 "github.com/NVIDIA/aistore/api/apc" 15 "github.com/NVIDIA/aistore/cmn" 16 "github.com/NVIDIA/aistore/cmn/cos" 17 "github.com/NVIDIA/aistore/cmn/debug" 18 "github.com/NVIDIA/aistore/cmn/mono" 19 "github.com/NVIDIA/aistore/cmn/nlog" 20 "github.com/NVIDIA/aistore/core" 21 "github.com/NVIDIA/aistore/core/meta" 22 "github.com/NVIDIA/aistore/hk" 23 "github.com/NVIDIA/aistore/xact/xs" 24 ) 25 26 // GC 27 const ( 28 gcTxnsInterval = time.Hour 29 gcTxnsNumKeep = 16 30 gcTxnsTimeotMult = 10 31 32 TxnTimeoutMult = 2 33 ) 34 35 type ( 36 txn interface { 37 // accessors 38 uuid() string 39 started(phase string, tm ...time.Time) time.Time 40 isDone() (done bool, err error) 41 set(nlps []core.NLP) 42 // triggers 43 commitAfter(caller string, msg *aisMsg, err error, args ...any) (bool, error) 44 rsvp(err error) 45 // cleanup 46 abort(error) 47 unlock() 48 // log 49 String() string 50 } 51 rndzvs struct { // rendezvous records 52 timestamp int64 53 err *txnError 54 callerName string 55 } 56 // two maps, two locks 57 transactions struct { 58 t *target 59 m map[string]txn // by txn.uuid 60 rendezvous struct { 61 m map[string]rndzvs // ditto 62 mtx sync.Mutex 63 } 64 mtx sync.Mutex 65 } 66 txnError struct { // a wrapper which presence means: "done" 67 err error 68 } 69 txnBase struct { // generic base 70 phase struct { 71 begin time.Time 72 commit time.Time 73 } 74 xctn core.Xact 75 err ratomic.Pointer[txnError] 76 action string 77 callerName string 78 callerID string 79 uid string 80 smapVer int64 81 bmdVer int64 82 sync.RWMutex 83 } 84 txnBckBase struct { 85 bck meta.Bck 86 nlps []core.NLP 87 txnBase 88 } 89 90 // 91 // concrete transaction types 92 // 93 txnCreateBucket struct { 94 txnBckBase 95 } 96 txnMakeNCopies struct { 97 txnBckBase 98 curCopies int64 99 newCopies int64 100 } 101 txnSetBucketProps struct { 102 bprops *cmn.Bprops 103 nprops *cmn.Bprops 104 txnBckBase 105 } 106 txnRenameBucket struct { 107 bckFrom *meta.Bck 108 bckTo *meta.Bck 109 txnBckBase 110 } 111 txnTCB struct { 112 xtcb *xs.XactTCB 113 txnBckBase 114 } 115 txnTCObjs struct { 116 xtco *xs.XactTCObjs 117 msg *cmn.TCObjsMsg 118 txnBckBase 119 } 120 txnECEncode struct { 121 txnBckBase 122 } 123 txnArchMultiObj struct { 124 xarch *xs.XactArch 125 msg *cmn.ArchiveBckMsg 126 txnBckBase 127 } 128 txnPromote struct { 129 msg *apc.PromoteArgs 130 xprm *xs.XactDirPromote 131 dirFQN string 132 fqns []string 133 txnBckBase 134 totalN int 135 fshare bool 136 } 137 ) 138 139 // interface guard 140 var ( 141 _ txn = (*txnBckBase)(nil) 142 _ txn = (*txnCreateBucket)(nil) 143 _ txn = (*txnMakeNCopies)(nil) 144 _ txn = (*txnSetBucketProps)(nil) 145 _ txn = (*txnRenameBucket)(nil) 146 _ txn = (*txnTCB)(nil) 147 _ txn = (*txnTCObjs)(nil) 148 _ txn = (*txnECEncode)(nil) 149 _ txn = (*txnPromote)(nil) 150 ) 151 152 ////////////////// 153 // transactions // 154 ////////////////// 155 156 func (txns *transactions) init(t *target) { 157 txns.t = t 158 txns.m = make(map[string]txn, 8) 159 txns.rendezvous.m = make(map[string]rndzvs, 8) 160 hk.Reg("txn"+hk.NameSuffix, txns.housekeep, gcTxnsInterval) 161 } 162 163 func (txns *transactions) begin(txn txn, nlps ...core.NLP) (err error) { 164 txns.mtx.Lock() 165 if x, ok := txns.m[txn.uuid()]; ok { 166 txns.mtx.Unlock() 167 for _, nlp := range nlps { 168 nlp.Unlock() 169 } 170 err = fmt.Errorf("%s: %s already exists (duplicate uuid?)", txns.t.si, x) 171 debug.AssertNoErr(err) 172 return 173 } 174 txn.started(apc.ActBegin, time.Now()) 175 txn.set(nlps) 176 txns.m[txn.uuid()] = txn 177 txns.mtx.Unlock() 178 179 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 180 nlog.Infof("%s begin: %s", txns.t, txn) 181 } 182 return 183 } 184 185 func (txns *transactions) find(uuid, act string) (txn, error) { 186 txns.mtx.Lock() 187 txn, ok := txns.m[uuid] 188 if !ok { 189 // a) not found (benign in an unlikely event of failing to commit) 190 txns.mtx.Unlock() 191 return nil, cos.NewErrNotFound(txns.t, "txn "+uuid) 192 } 193 194 if act == "" { 195 // b) just find & return 196 txns.mtx.Unlock() 197 return txn, nil 198 } 199 200 // or c) cleanup 201 delete(txns.m, uuid) 202 txns.mtx.Unlock() 203 204 txns.rendezvous.mtx.Lock() 205 delete(txns.rendezvous.m, uuid) 206 txns.rendezvous.mtx.Unlock() 207 208 if act == apc.ActAbort { 209 txn.abort(errors.New("action: abort")) // NOTE: may call txn-specific abort, e.g. TxnAbort 210 } else { 211 debug.Assert(act == apc.ActCommit || act == ActCleanup, act) 212 txn.unlock() 213 } 214 215 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 216 nlog.Infof("%s %s: %s", txns.t, act, txn) 217 } 218 return txn, nil 219 } 220 221 func (txns *transactions) commitBefore(caller string, msg *aisMsg) error { 222 var ( 223 rndzvs rndzvs 224 ok bool 225 ) 226 txns.rendezvous.mtx.Lock() 227 if rndzvs, ok = txns.rendezvous.m[msg.UUID]; !ok { 228 rndzvs.callerName, rndzvs.timestamp = caller, mono.NanoTime() 229 txns.rendezvous.m[msg.UUID] = rndzvs 230 txns.rendezvous.mtx.Unlock() 231 return nil 232 } 233 txns.rendezvous.mtx.Unlock() 234 return fmt.Errorf("rendezvous record %s:%d already exists", msg.UUID, rndzvs.timestamp) 235 } 236 237 func (txns *transactions) commitAfter(caller string, msg *aisMsg, err error, args ...any) (errDone error) { 238 txns.mtx.Lock() 239 txn, ok := txns.m[msg.UUID] 240 txns.mtx.Unlock() 241 242 var running bool 243 if ok { 244 // Ignore downgrade error. 245 if isErrDowngrade(err) { 246 err = nil 247 bmd := txns.t.owner.bmd.get() 248 nlog.Warningf("%s: commit with downgraded (current: %s)", txn, bmd) 249 } 250 if running, errDone = txn.commitAfter(caller, msg, err, args...); running { 251 nlog.Infoln(txn.String()) 252 } 253 } 254 if !running { 255 txns.rendezvous.mtx.Lock() 256 rndzvs, ok := txns.rendezvous.m[msg.UUID] 257 if !ok { // can't happen 258 txns.rendezvous.mtx.Unlock() 259 errDone = cos.NewErrNotFound(txns.t, "rendezvous record "+msg.UUID) 260 return 261 } 262 rndzvs.err = &txnError{err: err} 263 txns.rendezvous.m[msg.UUID] = rndzvs 264 txns.rendezvous.mtx.Unlock() 265 } 266 return 267 } 268 269 // given txn, wait for its completion, handle timeout, and ultimately remove 270 func (txns *transactions) wait(txn txn, timeoutNetw, timeoutHost time.Duration) (err error) { 271 // timestamp 272 txn.started(apc.ActCommit, time.Now()) 273 274 // transfer err rendezvous => txn 275 txns.rendezvous.mtx.Lock() 276 rndzvs, ok := txns.rendezvous.m[txn.uuid()] 277 txns.rendezvous.mtx.Unlock() 278 if ok && rndzvs.err != nil { 279 txn.rsvp(rndzvs.err.err) 280 } 281 282 err = txns._wait(txn, timeoutNetw, timeoutHost) 283 284 // cleanup or abort, depending on the returned err 285 act := apc.ActCommit 286 if err != nil { 287 act = apc.ActAbort 288 } 289 txns.find(txn.uuid(), act) 290 return err 291 } 292 293 // poll for 'done' 294 func (txns *transactions) _wait(txn txn, timeoutNetw, timeoutHost time.Duration) (err error) { 295 var ( 296 sleep = 100 * time.Millisecond 297 done, found bool 298 ) 299 for total := sleep; ; { 300 if done, err = txn.isDone(); done { 301 return err 302 } 303 // aborted? 304 if _, err = txns.find(txn.uuid(), ""); err != nil { 305 return err 306 } 307 308 time.Sleep(sleep) 309 total += sleep 310 // bump once 311 if total == sleep<<4 { 312 sleep *= 4 313 } 314 // must be ready for rendezvous 315 if !found { 316 txns.rendezvous.mtx.Lock() 317 _, found = txns.rendezvous.m[txn.uuid()] 318 txns.rendezvous.mtx.Unlock() 319 } 320 // two timeouts 321 if found { 322 // config.Timeout.MaxHostBusy (see p.prepTxnClient) 323 if timeoutHost != 0 && total > timeoutHost { 324 err = errors.New("timed out waiting for txn to complete") 325 break 326 } 327 } else if timeoutNetw != 0 && total > timeoutNetw { // 2 * config.Timeout.MaxKeepalive (see p.prepTxnClient) 328 err = errors.New("timed out waiting for commit message") 329 break 330 } 331 } 332 return err 333 } 334 335 // GC orphaned transactions 336 func (txns *transactions) housekeep() (d time.Duration) { 337 var ( 338 errs []error 339 orphans []txn 340 config = cmn.GCO.Get() 341 ) 342 d = gcTxnsInterval 343 txns.mtx.Lock() 344 l := len(txns.m) 345 if l == 0 { 346 txns.mtx.Unlock() 347 return 348 } 349 if l > max(gcTxnsNumKeep*4, 16) { 350 d = gcTxnsInterval / 10 351 } 352 now := time.Now() 353 for _, txn := range txns.m { 354 err, warn := checkTimeout(txn, now, config) 355 if err != nil { 356 errs = append(errs, err) 357 txn.abort(err) 358 delete(txns.m, txn.uuid()) 359 orphans = append(orphans, txn) 360 } else if warn != nil { 361 errs = append(errs, warn) 362 } 363 } 364 txns.mtx.Unlock() 365 366 if len(orphans) > 0 || len(errs) > 0 { 367 go txns.cleanup(orphans, errs) 368 } 369 return 370 } 371 372 func (txns *transactions) cleanup(orphans []txn, errs []error) { 373 if len(orphans) > 0 { 374 txns.rendezvous.mtx.Lock() 375 for _, txn := range orphans { 376 delete(txns.rendezvous.m, txn.uuid()) 377 } 378 txns.rendezvous.mtx.Unlock() 379 } 380 for _, e := range errs { 381 nlog.Errorln(e) 382 } 383 } 384 385 func checkTimeout(txn txn, now time.Time, config *cmn.Config) (err, warn error) { 386 elapsed := now.Sub(txn.started(apc.ActBegin)) 387 if commitTimestamp := txn.started(apc.ActCommit); !commitTimestamp.IsZero() { 388 elapsed = now.Sub(commitTimestamp) 389 if elapsed > gcTxnsTimeotMult*config.Timeout.MaxHostBusy.D() { 390 err = fmt.Errorf("gc %s: [commit - done] timeout", txn) 391 } else if elapsed >= TxnTimeoutMult*config.Timeout.MaxHostBusy.D() { 392 err = fmt.Errorf("gc %s: commit is taking too long", txn) 393 } 394 } else { 395 if elapsed > TxnTimeoutMult*config.Timeout.MaxHostBusy.D() { 396 err = fmt.Errorf("gc %s: [begin - start-commit] timeout", txn) 397 } else if elapsed >= TxnTimeoutMult*cmn.Rom.MaxKeepalive() { 398 warn = fmt.Errorf("gc %s: commit message is taking too long", txn) 399 } 400 } 401 return 402 } 403 404 ///////////// 405 // txnBase // 406 ///////////// 407 408 func (txn *txnBase) uuid() string { return txn.uid } 409 410 func (txn *txnBase) started(phase string, tm ...time.Time) (ts time.Time) { 411 switch phase { 412 case apc.ActBegin: 413 if len(tm) > 0 { 414 txn.phase.begin = tm[0] 415 } 416 ts = txn.phase.begin 417 case apc.ActCommit: 418 if len(tm) > 0 { 419 txn.phase.commit = tm[0] 420 } 421 ts = txn.phase.commit 422 default: 423 debug.Assert(false) 424 } 425 return 426 } 427 428 func (txn *txnBase) isDone() (done bool, err error) { 429 if txnErr := txn.err.Load(); txnErr != nil { 430 err = txnErr.err 431 done = true 432 } 433 return 434 } 435 436 func (txn *txnBase) rsvp(err error) { txn.err.Store(&txnError{err: err}) } 437 438 func (txn *txnBase) fillFromCtx(c *txnSrv) { 439 txn.uid = c.uuid 440 txn.action = c.msg.Action 441 txn.callerName = c.callerName 442 txn.callerID = c.callerID 443 txn.smapVer = c.t.owner.smap.get().version() 444 txn.bmdVer = c.t.owner.bmd.get().version() 445 } 446 447 //////////////// 448 // txnBckBase // 449 //////////////// 450 451 func newTxnBckBase(bck *meta.Bck) (txn *txnBckBase) { 452 txn = &txnBckBase{} 453 txn.init(bck) 454 return 455 } 456 457 func (txn *txnBckBase) init(bck *meta.Bck) { txn.bck = *bck } 458 459 func (txn *txnBckBase) set(nlps []core.NLP) { 460 txn.nlps = nlps 461 } 462 463 func (txn *txnBckBase) unlock() { 464 for _, p := range txn.nlps { 465 p.Unlock() 466 } 467 txn.nlps = txn.nlps[:0] 468 } 469 470 func (txn *txnBckBase) abort(err error) { 471 txn.unlock() 472 nlog.Infoln(txn.String(), "aborted:", err) 473 } 474 475 func (txn *txnBckBase) String() string { 476 var res, tm string 477 if done, err := txn.isDone(); done { 478 if err == nil { 479 res = " done" 480 } else { 481 res = fmt.Sprintf(" fail(%v)", err) 482 } 483 } 484 if txn.xctn != nil { 485 return fmt.Sprintf("txn-%s%s", txn.xctn, res) 486 } 487 if !txn.phase.commit.IsZero() { 488 tm = "-" + cos.FormatTime(txn.phase.commit, cos.StampMicro) 489 } 490 return fmt.Sprintf("txn-%s[%s]-%s%s%s]", txn.action, txn.uid, txn.bck.Bucket().String(), tm, res) 491 } 492 493 func (txn *txnBckBase) commitAfter(caller string, msg *aisMsg, err error, args ...any) (found bool, errDone error) { 494 if txn.callerName != caller || msg.UUID != txn.uuid() { 495 return 496 } 497 found = true 498 debug.Func(func() { 499 bmd, _ := args[0].(*bucketMD) 500 debug.Assert(bmd.version() >= txn.bmdVer) 501 }) 502 if txnErr := txn.err.Swap(&txnError{err: err}); txnErr != nil { 503 errDone = fmt.Errorf("%s: already done with err=%v (%v)", txn, txnErr.err, err) 504 txn.err.Store(txnErr) 505 } 506 return 507 } 508 509 ///////////////////// 510 // txnCreateBucket // 511 ///////////////////// 512 513 func newTxnCreateBucket(c *txnSrv) (txn *txnCreateBucket) { 514 txn = &txnCreateBucket{} 515 txn.init(c.bck) 516 txn.fillFromCtx(c) 517 return 518 } 519 520 //////////////////// 521 // txnMakeNCopies // 522 //////////////////// 523 524 func newTxnMakeNCopies(c *txnSrv, curCopies, newCopies int64) (txn *txnMakeNCopies) { 525 txn = &txnMakeNCopies{curCopies: curCopies, newCopies: newCopies} 526 txn.init(c.bck) 527 txn.fillFromCtx(c) 528 return 529 } 530 531 func (txn *txnMakeNCopies) String() string { 532 s := txn.txnBckBase.String() 533 return fmt.Sprintf("%s-copies(%d=>%d)", s, txn.curCopies, txn.newCopies) 534 } 535 536 /////////////////////// 537 // txnSetBucketProps // 538 /////////////////////// 539 540 func newTxnSetBucketProps(c *txnSrv, nprops *cmn.Bprops) (txn *txnSetBucketProps) { 541 cos.Assert(c.bck.Props != nil) 542 bprops := c.bck.Props.Clone() 543 txn = &txnSetBucketProps{bprops: bprops, nprops: nprops} 544 txn.init(c.bck) 545 txn.fillFromCtx(c) 546 return 547 } 548 549 ///////////////////// 550 // txnRenameBucket // 551 ///////////////////// 552 553 func newTxnRenameBucket(c *txnSrv, bckFrom, bckTo *meta.Bck) (txn *txnRenameBucket) { 554 txn = &txnRenameBucket{bckFrom: bckFrom, bckTo: bckTo} 555 txn.init(bckFrom) 556 txn.fillFromCtx(c) 557 return 558 } 559 560 //////////// 561 // txnTCB // 562 //////////// 563 564 func newTxnTCB(c *txnSrv, xtcb *xs.XactTCB) (txn *txnTCB) { 565 txn = &txnTCB{xtcb: xtcb} 566 txn.init(xtcb.Args().BckFrom) 567 txn.fillFromCtx(c) 568 return 569 } 570 571 func (txn *txnTCB) abort(err error) { 572 txn.unlock() 573 txn.xtcb.TxnAbort(err) 574 } 575 576 func (txn *txnTCB) String() string { 577 txn.xctn = txn.xtcb 578 return txn.txnBckBase.String() 579 } 580 581 /////////////// 582 // txnTCObjs // 583 /////////////// 584 585 func newTxnTCObjs(c *txnSrv, bckFrom *meta.Bck, xtco *xs.XactTCObjs, msg *cmn.TCObjsMsg) (txn *txnTCObjs) { 586 txn = &txnTCObjs{xtco: xtco, msg: msg} 587 txn.init(bckFrom) 588 txn.fillFromCtx(c) 589 return 590 } 591 592 func (txn *txnTCObjs) abort(err error) { 593 txn.unlock() 594 txn.xtco.TxnAbort(err) 595 } 596 597 func (txn *txnTCObjs) String() string { 598 txn.xctn = txn.xtco 599 return txn.txnBckBase.String() 600 } 601 602 ///////////////// 603 // txnECEncode // 604 ///////////////// 605 606 func newTxnECEncode(c *txnSrv, bck *meta.Bck) (txn *txnECEncode) { 607 txn = &txnECEncode{} 608 txn.init(bck) 609 txn.fillFromCtx(c) 610 return 611 } 612 613 /////////////////////////// 614 // txnCreateArchMultiObj // 615 /////////////////////////// 616 617 func newTxnArchMultiObj(c *txnSrv, bckFrom *meta.Bck, xarch *xs.XactArch, msg *cmn.ArchiveBckMsg) (txn *txnArchMultiObj) { 618 txn = &txnArchMultiObj{xarch: xarch, msg: msg} 619 txn.init(bckFrom) 620 txn.fillFromCtx(c) 621 return 622 } 623 624 func (txn *txnArchMultiObj) abort(err error) { 625 txn.unlock() 626 txn.xarch.TxnAbort(err) 627 } 628 629 func (txn *txnArchMultiObj) String() string { 630 txn.xctn = txn.xarch 631 return txn.txnBckBase.String() 632 } 633 634 //////////////// 635 // txnPromote // 636 //////////////// 637 638 func newTxnPromote(c *txnSrv, msg *apc.PromoteArgs, fqns []string, dirFQN string, totalN int) (txn *txnPromote) { 639 txn = &txnPromote{msg: msg, fqns: fqns, dirFQN: dirFQN, totalN: totalN} 640 txn.init(c.bck) 641 txn.fillFromCtx(c) 642 return 643 } 644 645 func (txn *txnPromote) String() (s string) { 646 txn.xctn = txn.xprm 647 return fmt.Sprintf("%s-src(%s)-N(%d)-fshare(%t)", txn.txnBckBase.String(), txn.dirFQN, txn.totalN, txn.fshare) 648 }