github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/kalive.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"sync"
	ratomic "sync/atomic"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/stats"
)

const (
	kaErrorMsg   = "error"
	kaStopMsg    = "stop"
	kaResumeMsg  = "resume"
	kaSuspendMsg = "suspend"

	kaNumRetries = 3
)

const (
	waitSelfJoin = 300 * time.Millisecond
	waitStandby  = 5 * time.Second
)

type (
	keepaliver interface {
		sendKalive(*smapX, time.Duration, bool) (string, int, error)
		heardFrom(sid string)
		do(config *cmn.Config) (stopped bool)
		timeToPing(sid string) bool
		ctrl(msg string)
		paused() bool
		cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf
		cluUptime(int64) time.Duration
	}
	talive struct {
		t *target
		keepalive
	}
	palive struct {
		p          *proxy
		stoppedCh  chan struct{}
		toRemoveCh chan string
		keepalive
	}
	keepalive struct {
		k            keepaliver
		hb           hbTracker
		statsT       stats.Tracker
		controlCh    chan controlSignal
		startedUp    *atomic.Bool
		name         string
		interval     time.Duration // config.Keepalive.Target.Interval or config.Keepalive.Proxy.Interval (10s)
		inProgress   atomic.Bool
		tickerPaused atomic.Bool
	}
	controlSignal struct {
		err error
		msg string
	}

	hbTracker interface {
		HeardFrom(id string, now int64) // callback for 'id' to respond
		TimedOut(id string) bool        // true if 'id' didn't keepalive or called (via "heard") within the interval (above)

		reg(id string)
		set(interval time.Duration) bool
	}
	heartBeat struct {
		last     sync.Map
		interval time.Duration // timeout
	}
)

// interface guard
var (
	_ cos.Runner = (*talive)(nil)
	_ cos.Runner = (*palive)(nil)

	_ keepaliver = (*talive)(nil)
	_ keepaliver = (*palive)(nil)

	_ hbTracker = (*heartBeat)(nil)
)

////////////
// talive //
////////////

func newTalive(t *target, statsT stats.Tracker, startedUp *atomic.Bool) *talive {
	config := cmn.GCO.Get()

	tkr := &talive{t: t}
	tkr.keepalive.name = "talive"
	tkr.keepalive.k = tkr
	tkr.statsT = statsT
	tkr.keepalive.startedUp = startedUp
	tkr.hb = newHB(config.Keepalive.Target.Interval.D())
	tkr.controlCh = make(chan controlSignal) // unbuffered on purpose
	tkr.interval = config.Keepalive.Target.Interval.D()
	return tkr
}

func (tkr *talive) Run() error {
	if stopped := tkr.wait(); stopped {
		return nil
	}

	tkr.init(tkr.t.owner.smap.get(), tkr.t.SID())

	nlog.Infof("Starting %s", tkr.Name())
	tkr._run()
	return nil
}

func (*talive) cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf {
	return &config.Keepalive.Target
}

func (tkr *talive) cluUptime(now int64) (elapsed time.Duration) {
	if at := tkr.t.startup.cluster.Load(); at > 0 {
		elapsed = time.Duration(now - at)
	}
	return
}

func (tkr *talive) sendKalive(smap *smapX, timeout time.Duration, fast bool) (string, int, error) {
	if fast {
		interrupted, restarted := tkr.t.interruptedRestarted()
		fast = !interrupted && !restarted
	}
	return tkr.t.sendKalive(smap, tkr.t, timeout, fast)
}

func (tkr *talive) do(config *cmn.Config) (stopped bool) {
	smap := tkr.t.owner.smap.get()
	if smap == nil || smap.validate() != nil {
		return
	}
	if !tkr.timeToPing(smap.Primary.ID()) { // skip sending keepalive
		return
	}
	if stopped = tkr.keepalive.do(smap, tkr.t.si, config); stopped {
		tkr.t.onPrimaryDown(nil /*proxy*/, "")
	}
	return
}

////////////
// palive //
////////////

func newPalive(p *proxy, statsT stats.Tracker, startedUp *atomic.Bool) *palive {
	config := cmn.GCO.Get()

	pkr := &palive{p: p}
	pkr.keepalive.name = "palive"
	pkr.keepalive.k = pkr
	pkr.statsT = statsT
	pkr.keepalive.startedUp = startedUp
	pkr.hb = newHB(config.Keepalive.Proxy.Interval.D())
	pkr.controlCh = make(chan controlSignal) // unbuffered on purpose
	pkr.interval = config.Keepalive.Proxy.Interval.D()
	return pkr
}

func (pkr *palive) Run() error {
	if stopped := pkr.wait(); stopped {
		return nil
	}

	pkr.init(pkr.p.owner.smap.get(), pkr.p.SID())

	nlog.Infof("Starting %s", pkr.Name())
	pkr._run()
	return nil
}

func (*palive) cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf {
	return &config.Keepalive.Proxy
}

func (pkr *palive) cluUptime(now int64) (elapsed time.Duration) {
	if at := pkr.p.startup.cluster.Load(); at > 0 {
		elapsed = time.Duration(now - at)
	}
	return
}

func (pkr *palive) sendKalive(smap *smapX, timeout time.Duration, fast bool) (string, int, error) {
	debug.Assert(!smap.isPrimary(pkr.p.si))
	return pkr.p.htrun.sendKalive(smap, nil /*htext*/, timeout, fast)
}

func (pkr *palive) do(config *cmn.Config) (stopped bool) {
	smap := pkr.p.owner.smap.get()
	if smap == nil || smap.validate() != nil {
		return
	}
	if smap.isPrimary(pkr.p.si) {
		if !pkr.inProgress.CAS(false, true) {
			nlog.Infoln(pkr.p.String() + ": primary keepalive in progress")
			return
		}
		stopped = pkr.updateSmap(config)
		pkr.inProgress.Store(false)
		return
	}
	if !pkr.timeToPing(smap.Primary.ID()) { // skip sending keepalive
		return
	}
	if stopped = pkr.keepalive.do(smap, pkr.p.si, config); stopped {
		pkr.p.onPrimaryDown(pkr.p /*self*/, "")
	}
	return
}

// updateSmap pings all nodes in parallel. Non-responding nodes get removed from the Smap and
// the resulting map is then metasync-ed.
func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
	var (
		p    = pkr.p
		smap = p.owner.smap.get()
		cnt  = smap.Count()
	)
	pkr.openCh(cnt)
	wg := cos.NewLimitedWaitGroup(cmn.MaxParallelism(), cnt) // limit parallelism
	for _, nm := range []meta.NodeMap{smap.Tmap, smap.Pmap} {
		for sid, si := range nm {
			if sid == p.SID() {
				continue
			}
			// skipping
			if !pkr.timeToPing(sid) {
				continue
			}
			// in re maintenance-mode nodes:
			// for future activation, passively (ie, no keepalives) keeping them in the cluster map -
			// use apc.ActRmNodeUnsafe to remove, if need be
			if si.InMaintOrDecomm() {
				continue
			}

			// direct call first
			started := mono.NanoTime()
			if _, _, err := pkr.p.reqHealth(si, config.Timeout.CplaneOperation.D(), nil, smap); err == nil {
				now := mono.NanoTime()
				pkr.statsT.Add(stats.KeepAliveLatency, now-started)
				pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
				continue
			}
			// otherwise, go keepalive with retries
			wg.Add(1)
			go pkr.ping(si, wg, smap, config)
		}
	}
	wg.Wait()
	if stopped = len(pkr.stoppedCh) > 0; stopped {
		pkr.closeCh()
		return
	}
	if len(pkr.toRemoveCh) == 0 {
		return
	}
	ctx := &smapModifier{pre: pkr._pre, final: pkr._final}
	err := p.owner.smap.modify(ctx)
	if err != nil {
		if ctx.msg != nil {
			nlog.Errorln("FATAL:", err)
		} else {
			nlog.Warningln(err)
		}
	}
	return
}

func (pkr *palive) ping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Config) {
	if len(pkr.stoppedCh) > 0 {
		wg.Done()
		return
	}
	ok, stopped := pkr._pingRetry(si, smap, config)
	if stopped {
		pkr.stoppedCh <- struct{}{}
	}
	if !ok {
		pkr.toRemoveCh <- si.ID()
	}
	wg.Done()
}

func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (ok, stopped bool) {
	var (
		timeout = config.Timeout.CplaneOperation.D()
		started = mono.NanoTime()
	)
	_, status, err := pkr.p.reqHealth(si, timeout, nil, smap)
	if err == nil {
		now := mono.NanoTime()
		pkr.statsT.Add(stats.KeepAliveLatency, now-started)
		pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
		return true, false
	}

	nlog.Warningf("node %s failed health ping [%v(%d)] - retry with max=%s", si.StringEx(), err, status,
		config.Timeout.MaxKeepalive.String())
	ticker := time.NewTicker(cmn.KeepaliveRetryDuration(config))
	ok, stopped = pkr.retry(si, ticker, config.Timeout.MaxKeepalive.D())
	ticker.Stop()

	return ok, stopped
}

func (pkr *palive) openCh(daemonCnt int) {
	if pkr.stoppedCh == nil || cap(pkr.stoppedCh) < daemonCnt {
		pkr.stoppedCh = make(chan struct{}, daemonCnt*2)
		pkr.toRemoveCh = make(chan string, daemonCnt*2)
	}
	debug.Assert(len(pkr.stoppedCh) == 0)
	debug.Assert(len(pkr.toRemoveCh) == 0)
}

func (pkr *palive) closeCh() {
	close(pkr.stoppedCh)
	close(pkr.toRemoveCh)
	pkr.stoppedCh, pkr.toRemoveCh = nil, nil
}

func (pkr *palive) _pre(ctx *smapModifier, clone *smapX) error {
	ctx.smap = pkr.p.owner.smap.get()
	if !ctx.smap.isPrimary(pkr.p.si) {
		return newErrNotPrimary(pkr.p.si, ctx.smap)
	}
	metaction := "keepalive: removing ["
	cnt := 0
loop:
	for {
		select {
		case sid := <-pkr.toRemoveCh:
			metaction += " ["
			if clone.GetProxy(sid) != nil {
				clone.delProxy(sid)
				clone.staffIC()
				metaction += apc.Proxy
				cnt++
			} else if clone.GetTarget(sid) != nil {
				clone.delTarget(sid)
				metaction += apc.Target
				cnt++
			} else {
				metaction += unknownDaemonID
				nlog.Warningf("node %s not present in the %s (old %s)", sid, clone, ctx.smap)
			}
			metaction += ":" + sid + "] "

			// Remove reverse proxy entry for the node.
			pkr.p.rproxy.nodes.Delete(sid)
		default:
			break loop
		}
	}
	metaction += "]"
	if cnt == 0 {
		return fmt.Errorf("%s: nothing to do [%s, %s]", pkr.p.si, ctx.smap.StringEx(), metaction)
	}
	ctx.msg = &apc.ActMsg{Value: metaction}
	return nil
}

func (pkr *palive) _final(ctx *smapModifier, clone *smapX) {
	msg := pkr.p.newAmsg(ctx.msg, nil)
	debug.Assert(clone._sgl != nil)
	_ = pkr.p.metasyncer.sync(revsPair{clone, msg})
}

func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, timeout time.Duration) (ok, stopped bool) {
	var i int
	for {
		if !pkr.timeToPing(si.ID()) {
			return true, false
		}
		select {
		case <-ticker.C:
			if !pkr.timeToPing(si.ID()) {
				return true, false // heard from the node, skipping health check
			}
			var (
				started = mono.NanoTime()
				smap    = pkr.p.owner.smap.get()
			)
			_, status, err := pkr.p.reqHealth(si, timeout, nil, smap)
			if err == nil {
				now := mono.NanoTime()
				pkr.statsT.Add(stats.KeepAliveLatency, now-started)
				pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
				return true, false
			}

			i++
			if i == kaNumRetries {
				nlog.Warningf("Failed after %d attempts - removing %s from %s", i, si.StringEx(), smap)
				return false, false
			}
			if cos.IsUnreachable(err, status) {
				continue
			}
			nlog.Warningf("Unexpected error %v(%d) from %s", err, status, si.StringEx())
		case sig := <-pkr.controlCh:
			if sig.msg == kaStopMsg {
				return false, true
			}
		}
	}
}

///////////////
// keepalive //
///////////////

func (k *keepalive) Name() string { return k.name }

func (k *keepalive) heardFrom(sid string) {
	k.hb.HeardFrom(sid, 0 /*now*/)
}

// wait for stats-runner to set startedUp=true
func (k *keepalive) wait() (stopped bool) {
	var ticker *time.Ticker
	if daemon.cli.target.standby {
		ticker = time.NewTicker(waitStandby)
	} else {
		ticker = time.NewTicker(waitSelfJoin)
	}
	stopped = k._wait(ticker)
	ticker.Stop()
	return
}

func (k *keepalive) _wait(ticker *time.Ticker) (stopped bool) {
	for {
		select {
		case <-ticker.C:
			if k.startedUp.Load() { // i.e., `statsRunner.startedUp`
				return false
			}
		case sig := <-k.controlCh:
			switch sig.msg {
			case kaStopMsg:
				return true
			default:
			}
		}
	}
}

// pre-populate hb
func (k *keepalive) init(smap *smapX, self string) {
	for _, nm := range []meta.NodeMap{smap.Pmap, smap.Tmap} {
		for sid := range nm {
			if sid == self {
				continue
			}
			k.hb.reg(sid)
		}
	}
}

func (k *keepalive) _run() {
	var (
		ticker    = time.NewTicker(k.interval)
		lastCheck int64
	)
	k.tickerPaused.Store(false)
	for {
		select {
		case <-ticker.C:
			lastCheck = mono.NanoTime()
			config := cmn.GCO.Get()
			k.k.do(config)
			k.configUpdate(k.k.cfg(config))
		case sig := <-k.controlCh:
			switch sig.msg {
			case kaResumeMsg:
				if k.tickerPaused.CAS(true, false) {
					ticker.Reset(k.interval)
				}
			case kaSuspendMsg:
				if k.tickerPaused.CAS(false, true) {
					ticker.Stop()
				}
			case kaStopMsg:
				ticker.Stop()
				return
			case kaErrorMsg:
				config := cmn.GCO.Get()
				if mono.Since(lastCheck) >= cmn.KeepaliveRetryDuration(config) {
					lastCheck = mono.NanoTime()
					nlog.Infof("triggered by %v", sig.err)
					if stopped := k.k.do(config); stopped {
						ticker.Stop()
						return
					}
				}
			}
		}
	}
}

func (k *keepalive) configUpdate(cfg *cmn.KeepaliveTrackerConf) {
	if k.hb.set(cfg.Interval.D()) {
		k.interval = cfg.Interval.D()
	}
}

// keepalive => primary
// is called by non-primary proxies and all targets
func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped bool) {
	var (
		pid     = smap.Primary.ID()
		timeout = config.Timeout.CplaneOperation.D()
		started = mono.NanoTime()
		fast    bool
	)
	if nlog.Stopping() {
		return
	}
	fast = k.k.cluUptime(started) > max(k.interval<<2, config.Timeout.Startup.D()>>1)
	cpid, status, err := k.k.sendKalive(smap, timeout, fast)
	if err == nil {
		now := mono.NanoTime()
		k.statsT.Add(stats.KeepAliveLatency, now-started)
		k.hb.HeardFrom(pid, now) // effectively, yes
		return
	}

	debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID())
	nlog.Warningf("%s => %s keepalive failed: %v(%d)", si, meta.Pname(pid), err, status)

	//
	// retry
	//
	var (
		ticker = time.NewTicker(cmn.KeepaliveRetryDuration(config))
		i      int
	)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// NOTE: suspecting primary down, not checking k.timeToPing(smap.Primary),
			// and therefore not skipping keepalive req (compare with palive.retry)
			i++
			started := mono.NanoTime()
			pid, status, err = k.k.sendKalive(nil, timeout, false)
			if pid == si.ID() {
				return // elected as primary
			}
			if err == nil {
				now := mono.NanoTime()
				k.statsT.Add(stats.KeepAliveLatency, now-started)
				k.hb.HeardFrom(pid, now) // effectively, yes
				nlog.Infof("%s: OK after %d attempt%s", si, i, cos.Plural(i))
				return
			}
			// repeat up to `kaNumRetries` with the max timeout
			timeout = config.Timeout.MaxKeepalive.D()

			if i == kaNumRetries {
				nlog.Warningf("%s: failed %d attempts => %s (primary)", si, i, meta.Pname(pid))
				return true
			}
			if cos.IsUnreachable(err, status) {
				continue
			}
			if nlog.Stopping() {
				return true
			}
			err = fmt.Errorf("%s: unexpected response from %s: %v(%d)", si, meta.Pname(pid), err, status)
			debug.AssertNoErr(err)
			nlog.Warningln(err)
		case sig := <-k.controlCh:
			if sig.msg == kaStopMsg {
				return true
			}
		}
	}
}

func (k *keepalive) timeToPing(sid string) bool {
	return k.hb.TimedOut(sid)
}

func (k *keepalive) Stop(err error) {
	nlog.Infof("Stopping %s, err: %v", k.Name(), err)
	k.controlCh <- controlSignal{msg: kaStopMsg}
	close(k.controlCh)
}

func (k *keepalive) ctrl(msg string) {
	nlog.Infof("Sending %q on the control channel", msg)
	k.controlCh <- controlSignal{msg: msg}
}

func (k *keepalive) paused() bool { return k.tickerPaused.Load() }

///////////////
// heartBeat //
///////////////

func newHB(interval time.Duration) *heartBeat { return &heartBeat{interval: interval} }

func (hb *heartBeat) HeardFrom(id string, now int64) {
	var (
		val   *int64
		v, ok = hb.last.Load(id)
	)
	if now == 0 {
		now = mono.NanoTime()
	}
	if ok {
		val = v.(*int64) // almost always
	} else {
		val = new(int64)
		hb.last.Store(id, val)
	}
	ratomic.StoreInt64(val, now)
}

func (hb *heartBeat) TimedOut(id string) bool {
	v, ok := hb.last.Load(id)
	if !ok {
		return true
	}
	val := v.(*int64)
	tim := ratomic.LoadInt64(val)

	return mono.Since(tim) > hb.interval
}

func (hb *heartBeat) reg(id string) { hb.last.Store(id, new(int64)) }

func (hb *heartBeat) set(interval time.Duration) (changed bool) {
	changed = hb.interval != interval
	hb.interval = interval
	return
}
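
For illustration, here is a minimal standalone sketch of the heartbeat-tracking pattern that the heartBeat type above implements: a sync.Map keyed by node ID that stores *int64 timestamps updated and read via sync/atomic, plus a timeout check against a configured interval. This is not AIStore code; the tracker type and the main driver are hypothetical names, and it substitutes the standard time package for AIStore's mono clock.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// tracker is a simplified stand-in for heartBeat: last maps
// node ID -> *int64 holding the nanosecond time of the last response.
type tracker struct {
	last     sync.Map
	interval time.Duration
}

// heardFrom records "now" for the given node ID, allocating the
// per-node counter on first use (same approach as heartBeat.HeardFrom).
func (t *tracker) heardFrom(id string) {
	now := time.Now().UnixNano() // stand-in for mono.NanoTime()
	if v, ok := t.last.Load(id); ok {
		atomic.StoreInt64(v.(*int64), now)
		return
	}
	val := new(int64)
	*val = now
	t.last.Store(id, val)
}

// timedOut reports whether the node has not been heard from within the interval.
func (t *tracker) timedOut(id string) bool {
	v, ok := t.last.Load(id)
	if !ok {
		return true // never registered => treat as timed out
	}
	last := atomic.LoadInt64(v.(*int64))
	return time.Since(time.Unix(0, last)) > t.interval
}

func main() {
	hb := &tracker{interval: 100 * time.Millisecond}
	hb.heardFrom("t1")
	fmt.Println(hb.timedOut("t1")) // false: heard from just now
	time.Sleep(150 * time.Millisecond)
	fmt.Println(hb.timedOut("t1")) // true: past the interval
	fmt.Println(hb.timedOut("t2")) // true: never heard from
}

Storing a *int64 per node (rather than the value itself) means steady-state heartbeats only read the sync.Map and update the counter atomically, without repeated Store calls on the map; that is the same design choice visible in heartBeat.HeardFrom, where the pointer is allocated once and subsequently overwritten with ratomic.StoreInt64.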