github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/kalive.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"sync"
	ratomic "sync/atomic"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/stats"
)

const (
	kaErrorMsg   = "error"
	kaStopMsg    = "stop"
	kaResumeMsg  = "resume"
	kaSuspendMsg = "suspend"

	kaNumRetries = 3
)

const (
	waitSelfJoin = 300 * time.Millisecond
	waitStandby  = 5 * time.Second
)

type (
	keepaliver interface {
		sendKalive(*smapX, time.Duration, bool) (string, int, error)
		heardFrom(sid string)
		do(config *cmn.Config) (stopped bool)
		timeToPing(sid string) bool
		ctrl(msg string)
		paused() bool
		cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf
		cluUptime(int64) time.Duration
	}
	talive struct {
		t *target
		keepalive
	}
	palive struct {
		p          *proxy
		stoppedCh  chan struct{}
		toRemoveCh chan string
		keepalive
	}
	keepalive struct {
		k            keepaliver
		hb           hbTracker
		statsT       stats.Tracker
		controlCh    chan controlSignal
		startedUp    *atomic.Bool
		name         string
		interval     time.Duration // config.Keepalive.Target.Interval or config.Keepalive.Proxy.Interval (10s)
		inProgress   atomic.Bool
		tickerPaused atomic.Bool
	}
	controlSignal struct {
		err error
		msg string
	}

	hbTracker interface {
		HeardFrom(id string, now int64) // record a response from 'id' at mono time 'now' (zero means: use the current time)
		TimedOut(id string) bool        // true if 'id' hasn't sent a keepalive or otherwise been heard from within the interval (above)

		reg(id string)
		set(interval time.Duration) bool
	}
	heartBeat struct {
		last     sync.Map
		interval time.Duration // timeout
	}
)

// interface guard
var (
	_ cos.Runner = (*talive)(nil)
	_ cos.Runner = (*palive)(nil)

	_ keepaliver = (*talive)(nil)
	_ keepaliver = (*palive)(nil)

	_ hbTracker = (*heartBeat)(nil)
)
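
// A note on the wiring above: talive and palive each embed the shared keepalive
// engine and, at the same time, implement the keepaliver interface that the engine
// calls back into via its `k` field (see newTalive and newPalive below). A minimal
// standalone sketch of the same pattern - hypothetical names, for illustration only:
//
//	type stepper interface{ step() }
//	type engine struct{ s stepper } // shared machinery
//	type nodeKA struct{ engine }    // node-specific wrapper
//
//	func (*nodeKA) step() { /* node-specific work */ }
//
//	func newNodeKA() *nodeKA { n := &nodeKA{}; n.engine.s = n; return n }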

////////////
// talive //
////////////

func newTalive(t *target, statsT stats.Tracker, startedUp *atomic.Bool) *talive {
	config := cmn.GCO.Get()

	tkr := &talive{t: t}
	tkr.keepalive.name = "talive"
	tkr.keepalive.k = tkr
	tkr.statsT = statsT
	tkr.keepalive.startedUp = startedUp
	tkr.hb = newHB(config.Keepalive.Target.Interval.D())
	tkr.controlCh = make(chan controlSignal) // unbuffered on purpose
	tkr.interval = config.Keepalive.Target.Interval.D()
	return tkr
}

func (tkr *talive) Run() error {
	if stopped := tkr.wait(); stopped {
		return nil
	}

	tkr.init(tkr.t.owner.smap.get(), tkr.t.SID())

	nlog.Infof("Starting %s", tkr.Name())
	tkr._run()
	return nil
}

func (*talive) cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf {
	return &config.Keepalive.Target
}

func (tkr *talive) cluUptime(now int64) (elapsed time.Duration) {
	if at := tkr.t.startup.cluster.Load(); at > 0 {
		elapsed = time.Duration(now - at)
	}
	return
}

func (tkr *talive) sendKalive(smap *smapX, timeout time.Duration, fast bool) (string, int, error) {
	if fast {
		interrupted, restarted := tkr.t.interruptedRestarted()
		fast = !interrupted && !restarted
	}
	return tkr.t.sendKalive(smap, tkr.t, timeout, fast)
}

func (tkr *talive) do(config *cmn.Config) (stopped bool) {
	smap := tkr.t.owner.smap.get()
	if smap == nil || smap.validate() != nil {
		return
	}
	if !tkr.timeToPing(smap.Primary.ID()) { // skip sending keepalive
		return
	}
	if stopped = tkr.keepalive.do(smap, tkr.t.si, config); stopped {
		tkr.t.onPrimaryDown(nil /*proxy*/, "")
	}
	return
}

////////////
// palive //
////////////

func newPalive(p *proxy, statsT stats.Tracker, startedUp *atomic.Bool) *palive {
	config := cmn.GCO.Get()

	pkr := &palive{p: p}
	pkr.keepalive.name = "palive"
	pkr.keepalive.k = pkr
	pkr.statsT = statsT
	pkr.keepalive.startedUp = startedUp
	pkr.hb = newHB(config.Keepalive.Proxy.Interval.D())
	pkr.controlCh = make(chan controlSignal) // unbuffered on purpose
	pkr.interval = config.Keepalive.Proxy.Interval.D()
	return pkr
}

func (pkr *palive) Run() error {
	if stopped := pkr.wait(); stopped {
		return nil
	}

	pkr.init(pkr.p.owner.smap.get(), pkr.p.SID())

	nlog.Infof("Starting %s", pkr.Name())
	pkr._run()
	return nil
}

func (*palive) cfg(config *cmn.Config) *cmn.KeepaliveTrackerConf {
	return &config.Keepalive.Proxy
}

func (pkr *palive) cluUptime(now int64) (elapsed time.Duration) {
	if at := pkr.p.startup.cluster.Load(); at > 0 {
		elapsed = time.Duration(now - at)
	}
	return
}

func (pkr *palive) sendKalive(smap *smapX, timeout time.Duration, fast bool) (string, int, error) {
	debug.Assert(!smap.isPrimary(pkr.p.si))
	return pkr.p.htrun.sendKalive(smap, nil /*htext*/, timeout, fast)
}

func (pkr *palive) do(config *cmn.Config) (stopped bool) {
	smap := pkr.p.owner.smap.get()
	if smap == nil || smap.validate() != nil {
		return
	}
	if smap.isPrimary(pkr.p.si) {
		if !pkr.inProgress.CAS(false, true) {
			nlog.Infoln(pkr.p.String() + ": primary keepalive in progress")
			return
		}
		stopped = pkr.updateSmap(config)
		pkr.inProgress.Store(false)
		return
	}
	if !pkr.timeToPing(smap.Primary.ID()) { // skip sending keepalive
		return
	}
	if stopped = pkr.keepalive.do(smap, pkr.p.si, config); stopped {
		pkr.p.onPrimaryDown(pkr.p /*self*/, "")
	}
	return
}

// updateSmap pings all nodes in parallel. Non-responding nodes get removed from the Smap and
// the resulting map is then metasync-ed.
func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
	var (
		p    = pkr.p
		smap = p.owner.smap.get()
		cnt  = smap.Count()
	)
	pkr.openCh(cnt)
	wg := cos.NewLimitedWaitGroup(cmn.MaxParallelism(), cnt) // limit parallelism
	for _, nm := range []meta.NodeMap{smap.Tmap, smap.Pmap} {
		for sid, si := range nm {
			if sid == p.SID() {
				continue
			}
			// skip nodes heard from recently
			if !pkr.timeToPing(sid) {
				continue
			}
			// maintenance-mode nodes are kept in the cluster map passively (i.e., no keepalives)
			// for future reactivation - use apc.ActRmNodeUnsafe to remove them, if need be
			if si.InMaintOrDecomm() {
				continue
			}

			// direct call first
			started := mono.NanoTime()
			if _, _, err := pkr.p.reqHealth(si, config.Timeout.CplaneOperation.D(), nil, smap); err == nil {
				now := mono.NanoTime()
				pkr.statsT.Add(stats.KeepAliveLatency, now-started)
				pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
				continue
			}
			// otherwise, go keepalive with retries
			wg.Add(1)
			go pkr.ping(si, wg, smap, config)
		}
	}
	wg.Wait()
	if stopped = len(pkr.stoppedCh) > 0; stopped {
		pkr.closeCh()
		return
	}
	if len(pkr.toRemoveCh) == 0 {
		return
	}
	ctx := &smapModifier{pre: pkr._pre, final: pkr._final}
	err := p.owner.smap.modify(ctx)
	if err != nil {
		if ctx.msg != nil {
			nlog.Errorln("FATAL:", err)
		} else {
			nlog.Warningln(err)
		}
	}
	return
}
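
// The flow above in miniature - a hypothetical sketch (illustration only; `nodes`,
// `healthy`, and `remove` are not real identifiers): fan out health checks with
// bounded parallelism, collect non-responders on a buffered channel, then drain
// the channel to build the removal list.
//
//	wg := cos.NewLimitedWaitGroup(cmn.MaxParallelism(), len(nodes))
//	failedCh := make(chan string, len(nodes))
//	for _, si := range nodes {
//		wg.Add(1)
//		go func(si *meta.Snode) {
//			defer wg.Done()
//			if !healthy(si) {
//				failedCh <- si.ID()
//			}
//		}(si)
//	}
//	wg.Wait()
//	close(failedCh)
//	for sid := range failedCh {
//		remove(sid)
//	}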

func (pkr *palive) ping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Config) {
	if len(pkr.stoppedCh) > 0 {
		wg.Done()
		return
	}
	ok, stopped := pkr._pingRetry(si, smap, config)
	if stopped {
		pkr.stoppedCh <- struct{}{}
	}
	if !ok {
		pkr.toRemoveCh <- si.ID()
	}
	wg.Done()
}

func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (ok, stopped bool) {
	var (
		timeout = config.Timeout.CplaneOperation.D()
		started = mono.NanoTime()
	)
	_, status, err := pkr.p.reqHealth(si, timeout, nil, smap)
	if err == nil {
		now := mono.NanoTime()
		pkr.statsT.Add(stats.KeepAliveLatency, now-started)
		pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
		return true, false
	}

	nlog.Warningf("node %s failed health ping [%v(%d)] - retry with max=%s", si.StringEx(), err, status,
		config.Timeout.MaxKeepalive.String())
	ticker := time.NewTicker(cmn.KeepaliveRetryDuration(config))
	ok, stopped = pkr.retry(si, ticker, config.Timeout.MaxKeepalive.D())
	ticker.Stop()

	return ok, stopped
}

func (pkr *palive) openCh(daemonCnt int) {
	if pkr.stoppedCh == nil || cap(pkr.stoppedCh) < daemonCnt {
		pkr.stoppedCh = make(chan struct{}, daemonCnt*2)
		pkr.toRemoveCh = make(chan string, daemonCnt*2)
	}
	debug.Assert(len(pkr.stoppedCh) == 0)
	debug.Assert(len(pkr.toRemoveCh) == 0)
}

func (pkr *palive) closeCh() {
	close(pkr.stoppedCh)
	close(pkr.toRemoveCh)
	pkr.stoppedCh, pkr.toRemoveCh = nil, nil
}

func (pkr *palive) _pre(ctx *smapModifier, clone *smapX) error {
	ctx.smap = pkr.p.owner.smap.get()
	if !ctx.smap.isPrimary(pkr.p.si) {
		return newErrNotPrimary(pkr.p.si, ctx.smap)
	}
	metaction := "keepalive: removing ["
	cnt := 0
loop:
	for {
		select {
		case sid := <-pkr.toRemoveCh:
			metaction += " ["
			if clone.GetProxy(sid) != nil {
				clone.delProxy(sid)
				clone.staffIC()
				metaction += apc.Proxy
				cnt++
			} else if clone.GetTarget(sid) != nil {
				clone.delTarget(sid)
				metaction += apc.Target
				cnt++
			} else {
				metaction += unknownDaemonID
				nlog.Warningf("node %s not present in the %s (old %s)", sid, clone, ctx.smap)
			}
			metaction += ":" + sid + "] "

			// Remove reverse proxy entry for the node.
			pkr.p.rproxy.nodes.Delete(sid)
		default:
			break loop
		}
	}
	metaction += "]"
	if cnt == 0 {
		return fmt.Errorf("%s: nothing to do [%s, %s]", pkr.p.si, ctx.smap.StringEx(), metaction)
	}
	ctx.msg = &apc.ActMsg{Value: metaction}
	return nil
}

func (pkr *palive) _final(ctx *smapModifier, clone *smapX) {
	msg := pkr.p.newAmsg(ctx.msg, nil)
	debug.Assert(clone._sgl != nil)
	_ = pkr.p.metasyncer.sync(revsPair{clone, msg})
}

func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, timeout time.Duration) (ok, stopped bool) {
	var i int
	for {
		if !pkr.timeToPing(si.ID()) {
			return true, false
		}
		select {
		case <-ticker.C:
			if !pkr.timeToPing(si.ID()) {
				return true, false // heard from the node, skipping health check
			}
			var (
				started = mono.NanoTime()
				smap    = pkr.p.owner.smap.get()
			)
			_, status, err := pkr.p.reqHealth(si, timeout, nil, smap)
			if err == nil {
				now := mono.NanoTime()
				pkr.statsT.Add(stats.KeepAliveLatency, now-started)
				pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
				return true, false
			}

			i++
			if i == kaNumRetries {
				nlog.Warningf("Failed after %d attempts - removing %s from %s", i, si.StringEx(), smap)
				return false, false
			}
			if cos.IsUnreachable(err, status) {
				continue
			}
			nlog.Warningf("Unexpected error %v(%d) from %s", err, status, si.StringEx())
		case sig := <-pkr.controlCh:
			if sig.msg == kaStopMsg {
				return false, true
			}
		}
	}
}

///////////////
// keepalive //
///////////////

func (k *keepalive) Name() string { return k.name }

func (k *keepalive) heardFrom(sid string) {
	k.hb.HeardFrom(sid, 0 /*now*/)
}

// wait for stats-runner to set startedUp=true
func (k *keepalive) wait() (stopped bool) {
	var ticker *time.Ticker
	if daemon.cli.target.standby {
		ticker = time.NewTicker(waitStandby)
	} else {
		ticker = time.NewTicker(waitSelfJoin)
	}
	stopped = k._wait(ticker)
	ticker.Stop()
	return
}

func (k *keepalive) _wait(ticker *time.Ticker) (stopped bool) {
	for {
		select {
		case <-ticker.C:
			if k.startedUp.Load() { // i.e., `statsRunner.startedUp`
				return false
			}
		case sig := <-k.controlCh:
			switch sig.msg {
			case kaStopMsg:
				return true
			default:
			}
		}
	}
}

// pre-populate hb
func (k *keepalive) init(smap *smapX, self string) {
	for _, nm := range []meta.NodeMap{smap.Pmap, smap.Tmap} {
		for sid := range nm {
			if sid == self {
				continue
			}
			k.hb.reg(sid)
		}
	}
}

func (k *keepalive) _run() {
	var (
		ticker    = time.NewTicker(k.interval)
		lastCheck int64
	)
	k.tickerPaused.Store(false)
	for {
		select {
		case <-ticker.C:
			lastCheck = mono.NanoTime()
			config := cmn.GCO.Get()
			k.k.do(config)
			k.configUpdate(k.k.cfg(config))
		case sig := <-k.controlCh:
			switch sig.msg {
			case kaResumeMsg:
				if k.tickerPaused.CAS(true, false) {
					ticker.Reset(k.interval)
				}
			case kaSuspendMsg:
				if k.tickerPaused.CAS(false, true) {
					ticker.Stop()
				}
			case kaStopMsg:
				ticker.Stop()
				return
			case kaErrorMsg:
				config := cmn.GCO.Get()
				if mono.Since(lastCheck) >= cmn.KeepaliveRetryDuration(config) {
					lastCheck = mono.NanoTime()
					nlog.Infof("triggered by %v", sig.err)
					if stopped := k.k.do(config); stopped {
						ticker.Stop()
						return
					}
				}
			}
		}
	}
}

func (k *keepalive) configUpdate(cfg *cmn.KeepaliveTrackerConf) {
	if k.hb.set(cfg.Interval.D()) {
		k.interval = cfg.Interval.D()
	}
}

// keepalive => primary
// (called by non-primary proxies and by all targets)
func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped bool) {
	var (
		pid     = smap.Primary.ID()
		timeout = config.Timeout.CplaneOperation.D()
		started = mono.NanoTime()
		fast    bool
	)
	if nlog.Stopping() {
		return
	}
	// fast keepalive only once the cluster has been up for a while
	// (longer than both 4x the keepalive interval and half the startup timeout)
	fast = k.k.cluUptime(started) > max(k.interval<<2, config.Timeout.Startup.D()>>1)
	cpid, status, err := k.k.sendKalive(smap, timeout, fast)
	if err == nil {
		now := mono.NanoTime()
		k.statsT.Add(stats.KeepAliveLatency, now-started)
		k.hb.HeardFrom(pid, now) // effectively, yes
		return
	}

	debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID())
	nlog.Warningf("%s => %s keepalive failed: %v(%d)", si, meta.Pname(pid), err, status)

	//
	// retry
	//
	var (
		ticker = time.NewTicker(cmn.KeepaliveRetryDuration(config))
		i      int
	)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// NOTE: suspecting primary down, not checking k.timeToPing(smap.Primary),
			// and therefore not skipping keepalive req (compare with palive.retry)
			i++
			started := mono.NanoTime()
			pid, status, err = k.k.sendKalive(nil, timeout, false)
			if pid == si.ID() {
				return // elected as primary
			}
			if err == nil {
				now := mono.NanoTime()
				k.statsT.Add(stats.KeepAliveLatency, now-started)
				k.hb.HeardFrom(pid, now) // effectively, yes
				nlog.Infof("%s: OK after %d attempt%s", si, i, cos.Plural(i))
				return
			}
			// repeat up to `kaNumRetries` with the max timeout
			timeout = config.Timeout.MaxKeepalive.D()

			if i == kaNumRetries {
				nlog.Warningf("%s: failed %d attempts => %s (primary)", si, i, meta.Pname(pid))
				return true
			}
			if cos.IsUnreachable(err, status) {
				continue
			}
			if nlog.Stopping() {
				return true
			}
			err = fmt.Errorf("%s: unexpected response from %s: %v(%d)", si, meta.Pname(pid), err, status)
			debug.AssertNoErr(err)
			nlog.Warningln(err)
		case sig := <-k.controlCh:
			if sig.msg == kaStopMsg {
				return true
			}
		}
	}
}

func (k *keepalive) timeToPing(sid string) bool {
	return k.hb.TimedOut(sid)
}

func (k *keepalive) Stop(err error) {
	nlog.Infof("Stopping %s, err: %v", k.Name(), err)
	k.controlCh <- controlSignal{msg: kaStopMsg}
	close(k.controlCh)
}

func (k *keepalive) ctrl(msg string) {
	nlog.Infof("Sending %q on the control channel", msg)
	k.controlCh <- controlSignal{msg: msg}
}

func (k *keepalive) paused() bool { return k.tickerPaused.Load() }
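
// Usage sketch (hypothetical helper, not called anywhere in this package): external
// code drives a running keepalive through its control channel. Note that ctrl()
// blocks until _run() receives the message - controlCh is unbuffered on purpose.
func _exampleKeepaliveCtrl(k *keepalive) {
	k.ctrl(kaSuspendMsg) // _run() stops its ticker; k.paused() now reports true
	k.ctrl(kaResumeMsg)  // _run() resets the ticker to k.interval
	k.Stop(nil)          // sends kaStopMsg and closes the control channel
}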

///////////////
// heartBeat //
///////////////

func newHB(interval time.Duration) *heartBeat { return &heartBeat{interval: interval} }

func (hb *heartBeat) HeardFrom(id string, now int64) {
	var (
		val   *int64
		v, ok = hb.last.Load(id)
	)
	if now == 0 {
		now = mono.NanoTime()
	}
	if ok {
		val = v.(*int64) // almost always
	} else {
		val = new(int64)
		hb.last.Store(id, val)
	}
	ratomic.StoreInt64(val, now)
}

func (hb *heartBeat) TimedOut(id string) bool {
	v, ok := hb.last.Load(id)
	if !ok {
		return true
	}
	val := v.(*int64)
	tim := ratomic.LoadInt64(val)

	return mono.Since(tim) > hb.interval
}

func (hb *heartBeat) reg(id string) { hb.last.Store(id, new(int64)) }

func (hb *heartBeat) set(interval time.Duration) (changed bool) {
	changed = hb.interval != interval
	hb.interval = interval
	return
}
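
// Usage sketch (hypothetical helper, not called anywhere in this package): a
// heartBeat records the last time each node was heard from and reports a timeout
// once that moment is older than the configured interval.
func _exampleHeartBeat() {
	hb := newHB(10 * time.Second)
	hb.reg("t[abcd1234]")          // pre-register the node (zero "last heard" time)
	hb.HeardFrom("t[abcd1234]", 0) // zero 'now' means: use the current mono time
	_ = hb.TimedOut("t[abcd1234]") // false now; true once 10s pass with no HeardFrom
	_ = hb.TimedOut("p[unknown]")  // an unregistered ID is always reported as timed out
}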