github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/globrun.go

// Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package reb

import (
    "errors"
    "fmt"
    "io"
    "net/http"
    "path/filepath"
    "runtime"
    "sync"
    ratomic "sync/atomic"
    "time"

    "github.com/NVIDIA/aistore/api/apc"
    "github.com/NVIDIA/aistore/cmn"
    "github.com/NVIDIA/aistore/cmn/atomic"
    "github.com/NVIDIA/aistore/cmn/cos"
    "github.com/NVIDIA/aistore/cmn/debug"
    "github.com/NVIDIA/aistore/cmn/fname"
    "github.com/NVIDIA/aistore/cmn/nlog"
    "github.com/NVIDIA/aistore/cmn/prob"
    "github.com/NVIDIA/aistore/core"
    "github.com/NVIDIA/aistore/core/meta"
    "github.com/NVIDIA/aistore/fs"
    "github.com/NVIDIA/aistore/transport"
    "github.com/NVIDIA/aistore/transport/bundle"
    "github.com/NVIDIA/aistore/xact"
    "github.com/NVIDIA/aistore/xact/xreg"
    "github.com/NVIDIA/aistore/xact/xs"
    jsoniter "github.com/json-iterator/go"
    "golang.org/x/sync/errgroup"
)

const (
    trname    = "reb"
    trnamePsh = "pshreb" // broadcast stage notifications
)

// rebalance stage enum
const (
    rebStageInactive = iota
    rebStageInit
    rebStageTraverse
    rebStageWaitAck
    rebStageFin
    rebStageFinStreams
    rebStageDone
    rebStageAbort // one of the targets aborts the rebalancing (never set, only sent)
)

const maxWackTargets = 4

var stages = map[uint32]string{
    rebStageInactive:   "<inactive>",
    rebStageInit:       "<init>",
    rebStageTraverse:   "<traverse>",
    rebStageWaitAck:    "<wack>",
    rebStageFin:        "<fin>",
    rebStageFinStreams: "<fin-streams>",
    rebStageDone:       "<done>",
    rebStageAbort:      "<abort>",
}

const fmtpend = "%s: newer rebalance[g%d] pending - not running"

type (
    Reb struct {
        smap      ratomic.Pointer[meta.Smap] // next smap (the new one that becomes current after rebalance)
        xreb      ratomic.Pointer[xs.Rebalance]
        dm        *bundle.DataMover
        pushes    *bundle.Streams // broadcast notifications
        filterGFN *prob.Filter
        semaCh    *cos.Semaphore
        ecClient  *http.Client
        stages    *nodeStages
        lomacks   [cos.MultiSyncMapCount]*lomAcks
        awaiting  struct {
            targets meta.Nodes // targets we are waiting for
            ts      int64      // last time we recomputed
            mtx     sync.Mutex
        }
        // (smap, xreb) + atomic state
        rebID   atomic.Int64
        nxtID   atomic.Int64
        inQueue atomic.Int64
        onAir   atomic.Int64
        mu      sync.RWMutex
        laterx  atomic.Bool
    }
    lomAcks struct {
        mu *sync.Mutex
        q  map[string]*core.LOM // on the wire, waiting for ACK
    }
    joggerBase struct {
        m    *Reb
        xreb *xs.Rebalance
        wg   *sync.WaitGroup
    }
    rebJogger struct {
        joggerBase
        smap *meta.Smap
        opts fs.WalkOpts
        ver  int64
    }
    rebArgs struct {
        smap   *meta.Smap
        config *cmn.Config
        apaths fs.MPI
        id     int64
        ecUsed bool
    }
)
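
// New creates the global rebalance manager: it initializes the GFN filter and
// per-node stage tracking, picks an intra-cluster HTTP client (TLS or plain) for
// EC traffic, sets up the data mover that receives rebalanced objects and ACKs,
// and allocates the semaphore that serializes one global rebalance at a time.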
func New(config *cmn.Config) *Reb {
    var (
        reb = &Reb{
            filterGFN: prob.NewDefaultFilter(),
            stages:    newNodeStages(),
        }
        cargs = cmn.TransportArgs{Timeout: config.Client.Timeout.D()}
    )
    if config.Net.HTTP.UseHTTPS {
        reb.ecClient = cmn.NewIntraClientTLS(cargs, config)
    } else {
        reb.ecClient = cmn.NewClient(cargs)
    }
    dmExtra := bundle.Extra{
        RecvAck:     reb.recvAck,
        Config:      config,
        Compression: config.Rebalance.Compression,
        Multiplier:  config.Rebalance.SbundleMult,
    }
    dm, err := bundle.NewDataMover(trname, reb.recvObj, cmn.OwtRebalance, dmExtra)
    if err != nil {
        cos.ExitLog(err)
    }
    debug.Assert(dm != nil)
    reb.dm = dm

    // serialize one global rebalance at a time
    reb.semaCh = cos.NewSemaphore(1)
    return reb
}

func (reb *Reb) regRecv() {
    if err := reb.dm.RegRecv(); err != nil {
        cos.ExitLog(err)
    }
    if err := transport.Handle(trnamePsh, reb.recvStageNtfn /*RecvObj*/); err != nil {
        cos.ExitLog(err)
    }
}

func (reb *Reb) unregRecv() {
    reb.dm.UnregRecv()
    err := transport.Unhandle(trnamePsh)
    debug.AssertNoErr(err)
}

// run sequence: non-EC and EC global
//
// main method: serialized to run one at a time and goes through controlled enumerated stages
// A note on stage management:
// 1. Non-EC and EC rebalances run in parallel
// 2. Execution starts after the `Reb` sets the current stage to rebStageTraverse
// 3. Only EC rebalance changes the current stage
// 4. Global rebalance performs checks such as `stage > rebStageTraverse` or
//    `stage < rebStageWaitAck`. Since all EC stages are between
//    `Traverse` and `WaitAck`, non-EC rebalance does not "notice" stage changes.
func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact) {
    if reb.nxtID.Load() >= id {
        return
    }
    reb.mu.Lock()
    if reb.nxtID.Load() >= id {
        reb.mu.Unlock()
        return
    }
    debug.Assert(id > reb.rebID.Load())
    reb.nxtID.Store(id)
    reb.mu.Unlock()

    logHdr := reb.logHdr(id, smap, true /*initializing*/)
    nlog.Infoln(logHdr + ": initializing")

    bmd := core.T.Bowner().Get()
    rargs := &rebArgs{id: id, smap: smap, config: cmn.GCO.Get(), ecUsed: bmd.IsECUsed()}
    if !reb.serialize(rargs, logHdr) {
        return
    }

    reb.regRecv()

    haveStreams := smap.HasActiveTs(core.T.SID())
    if bmd.IsEmpty() {
        haveStreams = false
    }
    if !reb.initRenew(rargs, notif, logHdr, haveStreams) {
        reb.unregRecv()
        reb.semaCh.Release()
        return
    }
    if !haveStreams {
        // cleanup and leave
        nlog.Infof("%s: nothing to do: %s, %s", logHdr, smap.StringEx(), bmd.StringEx())
        reb.stages.stage.Store(rebStageDone)
        reb.unregRecv()
        reb.semaCh.Release()
        fs.RemoveMarker(fname.RebalanceMarker)
        fs.RemoveMarker(fname.NodeRestartedPrev)
        reb.xctn().Finish()
        return
    }

    // abort all running `dtor.AbortRebRes` xactions (download, dsort, etl)
    xreg.AbortByNewReb(errors.New("reason: starting " + reb.xctn().Name()))

    // At this point, only one rebalance is running

    onGFN()

    errCnt := 0
    err := reb.run(rargs)
    if err == nil {
        errCnt = reb.rebWaitAck(rargs)
    } else {
        nlog.Warningln(err)
    }
    reb.changeStage(rebStageFin)

    for errCnt != 0 && !reb.xctn().IsAborted() {
        errCnt = bcast(rargs, reb.waitFinExtended)
    }

    reb.fini(rargs, logHdr, err)

    offGFN()
}

// To optimize goroutine creation:
// 1. The one-bucket case just calls a single rebalance worker, depending on
//    whether the bucket is erasure coded (no goroutine is used).
// 2. Multi-bucket rebalance may start both non-EC and EC in parallel.
//    It then waits until everything finishes.
func (reb *Reb) run(rargs *rebArgs) error {
    // 6. Capture stats, start mpath joggers
    reb.stages.stage.Store(rebStageTraverse)

    // No EC-enabled buckets - run only regular rebalance
    if !rargs.ecUsed {
        nlog.Infof("starting g%d", rargs.id)
        return reb.runNoEC(rargs)
    }

    // In all other cases run both rebalances simultaneously
    group := &errgroup.Group{}
    group.Go(func() error {
        nlog.Infof("starting non-EC g%d", rargs.id)
        return reb.runNoEC(rargs)
    })
    group.Go(func() error {
        nlog.Infof("starting EC g%d", rargs.id)
        return reb.runEC(rargs)
    })
    return group.Wait()
}
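
// serialize makes sure only one global rebalance runs at a time: it first pings
// the other targets, then competes for the single-rebalance semaphore (preempting
// an older rebalance if necessary); returns false when a newer rebalance is
// pending or this one is already running.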
func (reb *Reb) serialize(rargs *rebArgs, logHdr string) bool {
    // 1. check whether other targets are up and running
    if errCnt := bcast(rargs, reb.pingTarget); errCnt > 0 {
        return false
    }
    if rargs.smap.Version == 0 {
        rargs.smap = core.T.Sowner().Get()
    }
    // 2. serialize global rebalance and start new xaction -
    //    but only if the one that handles the current version is _not_ already in progress
    if newerRMD, alreadyRunning := reb.acquire(rargs, logHdr); newerRMD || alreadyRunning {
        return false
    }
    if rargs.smap.Version == 0 {
        rargs.smap = core.T.Sowner().Get()
    }
    rargs.apaths = fs.GetAvail()
    return true
}

func (reb *Reb) acquire(rargs *rebArgs, logHdr string) (newerRMD, alreadyRunning bool) {
    var (
        total    time.Duration
        sleep    = rargs.config.Timeout.CplaneOperation.D()
        maxTotal = max(20*sleep, 10*time.Second) // time to abort prev. streams
        maxwt    = max(rargs.config.Rebalance.DestRetryTime.D(), 2*maxTotal)
        errcnt   int
        acquired bool
    )
    for {
        select {
        case <-reb.semaCh.TryAcquire():
            acquired = true
        default:
            runtime.Gosched()
        }
        if id := reb.nxtID.Load(); id > rargs.id {
            nlog.Warningf(fmtpend, logHdr, id)
            newerRMD = true
            if acquired {
                reb.semaCh.Release()
            }
            return
        }
        if reb.rebID.Load() == rargs.id {
            if acquired {
                reb.semaCh.Release()
            }
            nlog.Warningf("%s: rebalance[g%d] is already running", logHdr, rargs.id)
            alreadyRunning = true
            return
        }

        if acquired { // ok
            if errcnt > 1 {
                nlog.Infof("%s: resolved (%d)", logHdr, errcnt)
            }
            return
        }

        // try to preempt
        err := reb._preempt(rargs, logHdr, total, maxTotal, errcnt)
        if err != nil {
            if total > maxwt {
                cos.ExitLog(err)
            }
            errcnt++
        }
        time.Sleep(sleep)
        total += sleep
    }
}

func (reb *Reb) _preempt(rargs *rebArgs, logHdr string, total, maxTotal time.Duration, errcnt int) (err error) {
    entry := xreg.GetRunning(xreg.Flt{Kind: apc.ActRebalance})
    if entry == nil {
        var (
            rebID   = reb.RebID()
            rsmap   = reb.smap.Load()
            rlogHdr = reb.logHdr(rebID, rsmap, true)
            xreb    = reb.xctn()
            s       string
        )
        if xreb != nil {
            s = ", " + xreb.String()
        }
        err = fmt.Errorf("%s: acquire/release asymmetry vs %s%s", logHdr, rlogHdr, s)
        if errcnt%2 == 1 {
            nlog.Errorln(err)
        }
        return
    }
    otherXreb := entry.Get().(*xs.Rebalance) // running or previous
    otherRebID := otherXreb.RebID()
    if otherRebID >= rargs.id {
        return
    }
    if !otherXreb.IsAborted() {
        otherXreb.Abort(cmn.ErrXactRenewAbort)
        nlog.Warningf("%s: aborting older %s", logHdr, otherXreb)
        return
    }
    if total > maxTotal {
        err = fmt.Errorf("%s: preempting older %s takes too much time", logHdr, otherXreb)
        nlog.Errorln(err)
        if xreb := reb.xctn(); xreb != nil && xreb.ID() == otherXreb.ID() {
            debug.Assert(reb.dm.GetXact().ID() == otherXreb.ID())
            nlog.Warningf("%s: aborting older streams...", logHdr)
            reb.abortStreams()
        }
    }
    return
}
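
// initRenew renews the rebalance xaction for the given rebalance ID, transitions
// the stage to "init", opens transport streams when there is anything to transmit,
// and persists the rebalance marker; returns false if a newer rebalance is pending
// or this one is already running.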
func (reb *Reb) initRenew(rargs *rebArgs, notif *xact.NotifXact, logHdr string, haveStreams bool) bool {
    if id := reb.nxtID.Load(); id > rargs.id {
        nlog.Warningf(fmtpend, logHdr, id)
        return false
    }
    rns := xreg.RenewRebalance(rargs.id)
    debug.AssertNoErr(rns.Err)
    if rns.IsRunning() {
        return false
    }
    xctn := rns.Entry.Get()

    notif.Xact = xctn
    xctn.AddNotif(notif)

    reb.mu.Lock()
    if id := reb.nxtID.Load(); id > rargs.id {
        reb.mu.Unlock()
        nlog.Warningf(fmtpend, logHdr, id)
        return false
    }
    reb.stages.stage.Store(rebStageInit)
    xreb := xctn.(*xs.Rebalance)
    reb.setXact(xreb)
    reb.rebID.Store(rargs.id)

    // check Smap _prior_ to opening streams
    smap := core.T.Sowner().Get()
    if smap.Version != rargs.smap.Version {
        debug.Assert(smap.Version > rargs.smap.Version)
        nlog.Errorf("Warning %s: %s post-init version change %s => %s", core.T, xreb, rargs.smap, smap)
        // TODO: handle an unlikely corner case keeping in mind that not every change warrants a different rebalance
    }

    // 3. init streams and data structures
    if haveStreams {
        reb.beginStreams(rargs.config)
    }

    if reb.awaiting.targets == nil {
        reb.awaiting.targets = make(meta.Nodes, 0, maxWackTargets)
    } else {
        reb.awaiting.targets = reb.awaiting.targets[:0]
    }
    acks := reb.lomAcks()
    for i := range len(acks) { // init lom acks
        acks[i] = &lomAcks{mu: &sync.Mutex{}, q: make(map[string]*core.LOM, 64)}
    }

    // 4. create persistent marker
    if fatalErr, writeErr := fs.PersistMarker(fname.RebalanceMarker); fatalErr != nil || writeErr != nil {
        err := writeErr
        if fatalErr != nil {
            err = fatalErr
        }
        reb.endStreams(err)
        xctn.Abort(err)
        reb.mu.Unlock()
        nlog.Errorf("FATAL: %v, WRITE: %v", fatalErr, writeErr)
        return false
    }

    // 5. ready - can receive objects
    reb.smap.Store(rargs.smap)
    reb.stages.cleanup()

    reb.mu.Unlock()
    nlog.Infof("%s: running %s", reb.logHdr(rargs.id, rargs.smap), reb.xctn())
    return true
}
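
// beginStreams attaches the rebalance xaction to the data mover, opens it, and
// creates the "push" stream bundle used to broadcast stage notifications.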
func (reb *Reb) beginStreams(config *cmn.Config) {
    debug.Assert(reb.stages.stage.Load() == rebStageInit)

    xreb := reb.xctn()
    reb.dm.SetXact(xreb)
    reb.dm.Open()
    pushArgs := bundle.Args{
        Net:        reb.dm.NetC(),
        Trname:     trnamePsh,
        Multiplier: config.Rebalance.SbundleMult,
        Extra:      &transport.Extra{SenderID: xreb.ID(), Config: config},
    }
    reb.pushes = bundle.New(transport.NewIntraDataClient(), pushArgs)

    reb.laterx.Store(false)
    reb.inQueue.Store(0)
}

func (reb *Reb) abortStreams() {
    reb.dm.Abort()
    reb.pushes.Abort()
}

func (reb *Reb) endStreams(err error) {
    if reb.stages.stage.CAS(rebStageFin, rebStageFinStreams) {
        reb.dm.Close(err)
        reb.pushes.Close(true)
    }
}

// when at least one bucket has EC enabled
func (reb *Reb) runEC(rargs *rebArgs) error {
    errCnt := bcast(rargs, reb.rxReady) // ignore timeout
    xreb := reb.xctn()
    if err := xreb.AbortErr(); err != nil {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Infoln(logHdr, "abort ec rx-ready", err, "num-fail", errCnt)
        return err
    }
    if errCnt > 0 {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Errorln(logHdr, "ec rx-ready num-fail", errCnt) // unlikely
    }

    reb.runECjoggers()

    if err := xreb.AbortErr(); err != nil {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Infoln(logHdr, "abort ec-joggers", err)
        return err
    }
    nlog.Infof("[%s] RebalanceEC done", core.T.SID())
    return nil
}

// when not a single bucket has EC enabled
func (reb *Reb) runNoEC(rargs *rebArgs) error {
    errCnt := bcast(rargs, reb.rxReady) // ignore timeout
    xreb := reb.xctn()
    if err := xreb.AbortErr(); err != nil {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Infoln(logHdr, "abort rx-ready", err, "num-fail", errCnt)
        return err
    }
    if errCnt > 0 {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Errorln(logHdr, "rx-ready num-fail", errCnt) // unlikely
    }

    wg := &sync.WaitGroup{}
    ver := rargs.smap.Version
    for _, mi := range rargs.apaths {
        rl := &rebJogger{
            joggerBase: joggerBase{m: reb, xreb: reb.xctn(), wg: wg},
            smap:       rargs.smap, ver: ver,
        }
        wg.Add(1)
        go rl.jog(mi)
    }
    wg.Wait()

    if err := xreb.AbortErr(); err != nil {
        logHdr := reb.logHdr(rargs.id, rargs.smap)
        nlog.Infoln(logHdr, "abort joggers", err)
        return err
    }
    if cmn.Rom.FastV(4, cos.SmoduleReb) {
        nlog.Infof("finished rebalance walk (g%d)", rargs.id)
    }
    return nil
}
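
// rebWaitAck waits until every transmitted object has been ACKed by its destination
// (or the configured timeout expires), then synchronizes stages with the other
// targets and retransmits whatever remains unacknowledged; it keeps looping until
// there is nothing left to retransmit or the xaction aborts.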
func (reb *Reb) rebWaitAck(rargs *rebArgs) (errCnt int) {
    var (
        cnt    int
        logHdr = reb.logHdr(rargs.id, rargs.smap)
        sleep  = rargs.config.Timeout.CplaneOperation.D()
        maxwt  = rargs.config.Rebalance.DestRetryTime.D()
        xreb   = reb.xctn()
        smap   = rargs.smap
    )
    maxwt += time.Duration(int64(time.Minute) * int64(rargs.smap.CountTargets()/10))
    maxwt = min(maxwt, rargs.config.Rebalance.DestRetryTime.D()*2)
    reb.changeStage(rebStageWaitAck)

    for {
        curwt := time.Duration(0)
        // poll for no more than maxwt while keeping track of the cumulative polling time via curwt
        // (here and elsewhere)
        for curwt < maxwt {
            cnt = 0
            var logged bool
            for _, lomack := range reb.lomAcks() {
                lomack.mu.Lock()
                if l := len(lomack.q); l > 0 {
                    cnt += l
                    if !logged {
                        for _, lom := range lomack.q {
                            tsi, err := smap.HrwHash2T(lom.Digest())
                            if err == nil {
                                nlog.Infof("waiting for %s ACK from %s", lom, tsi.StringEx())
                                logged = true
                                break
                            }
                        }
                    }
                }
                lomack.mu.Unlock()
                if err := xreb.AbortErr(); err != nil {
                    nlog.Infof("%s: abort wait-ack (%v)", logHdr, err)
                    return
                }
            }
            if cnt == 0 {
                nlog.Infof("%s: received all ACKs", logHdr)
                break
            }
            nlog.Warningf("%s: waiting for %d ACKs", logHdr, cnt)
            if err := xreb.AbortedAfter(sleep); err != nil {
                nlog.Infof("%s: abort wait-ack (%v)", logHdr, err)
                return
            }

            curwt += sleep
        }
        if cnt > 0 {
            nlog.Warningf("%s: timed out waiting for %d ACK%s", logHdr, cnt, cos.Plural(cnt))
        }
        if xreb.IsAborted() {
            return
        }

        // NOTE: requires locally migrated objects *not* to be removed at the src
        aPaths, _ := fs.Get()
        if len(aPaths) > len(rargs.apaths) {
            nlog.Warningf("%s: mountpath changes detected (%d, %d)", logHdr, len(aPaths), len(rargs.apaths))
        }

        // 8. synchronize
        nlog.Infof("%s: poll targets for: stage=(%s or %s***)", logHdr, stages[rebStageFin], stages[rebStageWaitAck])
        errCnt = bcast(rargs, reb.waitFinExtended)
        if xreb.IsAborted() {
            return
        }

        // 9. retransmit if needed
        cnt = reb.retransmit(rargs, xreb)
        if cnt == 0 || reb.xctn().IsAborted() {
            break
        }
        nlog.Warningf("%s: retransmitted %d, more wack...", logHdr, cnt)
    }

    return
}

func (reb *Reb) retransmit(rargs *rebArgs, xreb *xs.Rebalance) (cnt int) {
    if reb._aborted(rargs) {
        return
    }
    var (
        rj = &rebJogger{joggerBase: joggerBase{
            m: reb, xreb: reb.xctn(),
            wg: &sync.WaitGroup{},
        }, smap: rargs.smap}
        loghdr = reb.logHdr(rargs.id, rargs.smap)
    )
    for _, lomAck := range reb.lomAcks() {
        lomAck.mu.Lock()
        for uname, lom := range lomAck.q {
            if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
                if cos.IsNotExist(err, 0) {
                    if cmn.Rom.FastV(5, cos.SmoduleReb) {
                        nlog.Infoln(loghdr, lom.Cname(), "not found")
                    }
                } else {
                    err = cmn.NewErrFailedTo(core.T, "load", lom.Cname(), err)
                    rj.xreb.AddErr(err)
                }
                delete(lomAck.q, uname)
                continue
            }
            tsi, _ := rargs.smap.HrwHash2T(lom.Digest())
            if core.T.HeadObjT2T(lom, tsi) {
                if cmn.Rom.FastV(4, cos.SmoduleReb) {
                    nlog.Infof("%s: HEAD ok %s at %s", loghdr, lom, tsi.StringEx())
                }
                delete(lomAck.q, uname)
                continue
            }
            // retransmit
            roc, err := _getReader(lom)
            if err == nil {
                err = rj.doSend(lom, tsi, roc)
            }
            if err == nil {
                if cmn.Rom.FastV(4, cos.SmoduleReb) {
                    nlog.Infof("%s: retransmit %s => %s", loghdr, lom, tsi.StringEx())
                }
                cnt++
            } else {
                if cmn.IsErrStreamTerminated(err) {
                    xreb.Abort(err)
                    nlog.Errorf("%s: stream term-ed (%v)", loghdr, err)
                } else {
                    err = fmt.Errorf("%s: failed to retransmit %s => %s: %w", loghdr, lom, tsi.StringEx(), err)
                    rj.xreb.AddErr(err)
                }
            }
            if reb._aborted(rargs) {
                lomAck.mu.Unlock()
                return 0
            }
        }
        lomAck.mu.Unlock()
        if reb._aborted(rargs) {
            return 0
        }
    }
    return
}

func (reb *Reb) _aborted(rargs *rebArgs) (yes bool) {
    yes = reb.xctn().IsAborted()
    yes = yes || (rargs.smap.Version != core.T.Sowner().Get().Version)
    return
}
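
// fini quiesces the transport, removes the on-disk rebalance markers (unless
// aborted), closes the streams, logs final stats, and marks both the stage and
// the xaction as done.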
func (reb *Reb) fini(rargs *rebArgs, logHdr string, err error) {
    var stats core.Stats
    if cmn.Rom.FastV(4, cos.SmoduleReb) {
        nlog.Infof("finishing rebalance (reb_args: %s)", reb.logHdr(rargs.id, rargs.smap))
    }
    // prior to closing the streams
    if q := reb.quiesce(rargs, rargs.config.Transport.QuiesceTime.D(), reb.nodesQuiescent); q != core.QuiAborted {
        if errM := fs.RemoveMarker(fname.RebalanceMarker); errM == nil {
            nlog.Infof("%s: %s removed marker ok", core.T, reb.xctn())
        }
        _ = fs.RemoveMarker(fname.NodeRestartedPrev)
    }
    reb.endStreams(err)
    reb.filterGFN.Reset()
    xreb := reb.xctn()
    xreb.ToStats(&stats)
    if stats.Objs > 0 || stats.OutObjs > 0 || stats.InObjs > 0 {
        s, e := jsoniter.MarshalIndent(&stats, "", " ")
        debug.AssertNoErr(e)
        nlog.Infoln(string(s))
    }
    reb.stages.stage.Store(rebStageDone)
    reb.stages.cleanup()

    reb.unregRecv()
    reb.semaCh.Release()
    if !xreb.Finished() {
        xreb.Finish()
    }
    nlog.Infof("%s: done (%s)", logHdr, xreb)
}

//////////////////////////////
// rebJogger: global non-EC //
//////////////////////////////

func (rj *rebJogger) jog(mi *fs.Mountpath) {
    // the jogger runs in a separate goroutine; use defer to make sure
    // `Done` is called even if the jogger crashes, to avoid a hang-up
    defer rj.wg.Done()
    {
        rj.opts.Mi = mi
        rj.opts.CTs = []string{fs.ObjectType}
        rj.opts.Callback = rj.visitObj
        rj.opts.Sorted = false
    }
    bmd := core.T.Bowner().Get()
    bmd.Range(nil, nil, rj.walkBck)
}

func (rj *rebJogger) walkBck(bck *meta.Bck) bool {
    rj.opts.Bck.Copy(bck.Bucket())
    err := fs.Walk(&rj.opts)
    if err == nil {
        return rj.xreb.IsAborted()
    }
    if rj.xreb.IsAborted() {
        nlog.Infoln(rj.xreb.Name(), "aborting traversal")
    } else {
        nlog.Errorln(core.T.String(), rj.xreb.Name(), "failed to traverse", err)
    }
    return true
}

// send completion
func (rj *rebJogger) objSentCallback(hdr *transport.ObjHdr, _ io.ReadCloser, arg any, err error) {
    rj.m.inQueue.Dec()
    if err == nil {
        rj.xreb.OutObjsAdd(1, hdr.ObjAttrs.Size) // NOTE: double-counts retransmissions
        return
    }
    // log err
    if cmn.Rom.FastV(4, cos.SmoduleReb) || !cos.IsRetriableConnErr(err) {
        if bundle.IsErrDestinationMissing(err) {
            nlog.Errorf("%s: %v, %s", rj.xreb.Name(), err, rj.smap)
        } else {
            lom, ok := arg.(*core.LOM)
            debug.Assert(ok)
            nlog.Errorf("%s: %s failed to send %s: %v", core.T, rj.xreb.Name(), lom, err)
        }
    }
}

func (rj *rebJogger) visitObj(fqn string, de fs.DirEntry) error {
    if err := rj.xreb.AbortErr(); err != nil {
        nlog.Infoln(rj.xreb.Name(), "rj-walk-visit aborted", err)
        return err
    }
    if de.IsDir() {
        return nil
    }
    lom := core.AllocLOM(fqn)
    err := rj._lwalk(lom, fqn)
    if err != nil {
        core.FreeLOM(lom)
        if err == cmn.ErrSkip {
            err = nil
        }
    }
    return err
}
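
// _lwalk initializes the LOM and decides whether to transmit it: EC-enabled
// buckets are left to the EC rebalance, and objects that HRW-map to this target
// or were already transmitted via GFN are skipped; everything else is read-locked
// and sent to its new destination.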
func (rj *rebJogger) _lwalk(lom *core.LOM, fqn string) error {
    if err := lom.InitFQN(fqn, nil); err != nil {
        if cmn.IsErrBucketLevel(err) {
            return err
        }
        return cmn.ErrSkip
    }
    // skip EC.Enabled bucket - leave the job for EC rebalance
    if lom.ECEnabled() {
        return filepath.SkipDir
    }
    tsi, err := rj.smap.HrwHash2T(lom.Digest())
    if err != nil {
        return err
    }
    if tsi.ID() == core.T.SID() {
        return cmn.ErrSkip
    }

    // skip objects that were already sent via GFN (note that, due to probabilistic
    // filtering, rare false positives are still possible)
    uname := cos.UnsafeB(lom.Uname())
    if rj.m.filterGFN.Lookup(uname) {
        rj.m.filterGFN.Delete(uname)
        return cmn.ErrSkip
    }
    // prepare to send: rlock, load, new roc
    var roc cos.ReadOpenCloser
    if roc, err = _getReader(lom); err != nil {
        return err
    }

    // transmit (unlock via transport completion => roc.Close)
    rj.m.addLomAck(lom)
    if err := rj.doSend(lom, tsi, roc); err != nil {
        rj.m.delLomAck(lom, 0, false /*free LOM*/)
        return err
    }

    return nil
}

// takes rlock and keeps it _iff_ successful
func _getReader(lom *core.LOM) (roc cos.ReadOpenCloser, err error) {
    lom.Lock(false)
    if err = lom.Load(false /*cache it*/, true /*locked*/); err != nil {
        lom.Unlock(false)
        return
    }
    if lom.IsCopy() {
        lom.Unlock(false)
        err = cmn.ErrSkip
        return
    }
    if lom.Checksum() == nil {
        if _, err = lom.ComputeSetCksum(); err != nil {
            lom.Unlock(false)
            return
        }
    }
    debug.Assert(lom.Checksum() != nil, lom.String())
    return lom.NewDeferROC()
}

func (rj *rebJogger) doSend(lom *core.LOM, tsi *meta.Snode, roc cos.ReadOpenCloser) error {
    var (
        ack    = regularAck{rebID: rj.m.RebID(), daemonID: core.T.SID()}
        o      = transport.AllocSend()
        opaque = ack.NewPack()
    )
    o.Hdr.Bck.Copy(lom.Bucket())
    o.Hdr.ObjName = lom.ObjName
    o.Hdr.Opaque = opaque
    o.Hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
    o.Callback, o.CmplArg = rj.objSentCallback, lom
    rj.m.inQueue.Inc()
    return rj.m.dm.Send(o, roc, tsi)
}