github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3em/agent/heartbeater.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package agent
    22  
    23  import (
    24  	"context"
    25  	"fmt"
    26  	"sync"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	hb "github.com/m3db/m3/src/m3em/generated/proto/heartbeat"
    31  	xclock "github.com/m3db/m3/src/x/clock"
    32  	"github.com/m3db/m3/src/x/instrument"
    33  
    34  	"go.uber.org/zap"
    35  	"google.golang.org/grpc"
    36  )
    37  
// runner is the minimal view of the agent that the heartbeater depends on.
type runner interface {
	// Running returns the value reported as ProcessRunning in each heartbeat.
	Running() bool
}
    41  
    42  type timeoutFn func(time.Time)
    43  type errorFn func(err error)
    44  
// heartbeatOpts bundles the configuration consumed by the heartbeater.
type heartbeatOpts struct {
	// operatorUUID is sent as OperatorUuid in every heartbeat request.
	operatorUUID string
	// endpoint is the address handed to grpc.Dial.
	endpoint     string
	// nowFn is the clock used to timestamp heartbeat send attempts.
	nowFn        xclock.NowFn
	// timeout is used both as the grpc dial timeout and as the period for which
	// sends may keep failing before timeoutFn is invoked.
	timeout      time.Duration
	// timeoutFn is invoked, in its own goroutine, when sends have been failing
	// for longer than timeout.
	timeoutFn    timeoutFn
	// errorFn is invoked when the heartbeat loop receives an invalid message.
	errorFn      errorFn
}
    53  
// heatbeater periodically sends heartbeats to a remote endpoint over grpc and
// relays process-level events (termination, overwrite) as heartbeats too.
// NOTE(review): the type name is missing an 'r' ("heartbeater"); it is kept
// as-is because other code refers to it by this name.
type heatbeater struct {
	// The embedded lock is read-held while a request is being built and
	// write-held for the duration of each send.
	sync.RWMutex
	// iopts supplies the logger used when sends fail.
	iopts           instrument.Options
	// runner reports whether the managed process is running.
	runner          runner
	// opts is the configuration supplied at construction time.
	opts            heartbeatOpts
	// running is non-zero while the heartbeat loop is active; accessed atomically.
	running         int32
	// closed is non-zero once close has been called; accessed atomically.
	closed          int32
	// msgChan carries control messages to the heartbeat loop.
	msgChan         chan heartbeatMsg
	// doneCh is signalled by the heartbeat loop when it exits.
	doneCh          chan struct{}
	// conn is the underlying grpc connection; set to nil by close.
	conn            *grpc.ClientConn
	// client is the heartbeat service client bound to conn.
	client          hb.HeartbeaterClient
	// lastHeartbeatTs is the reference time compared against the clock when a
	// send fails, to decide whether the timeout has been exceeded.
	lastHeartbeatTs time.Time
}
    67  
    68  func newHeartbeater(
    69  	agent runner,
    70  	opts heartbeatOpts,
    71  	iopts instrument.Options,
    72  ) (*heatbeater, error) {
    73  	conn, err := grpc.Dial(opts.endpoint,
    74  		grpc.WithInsecure(),
    75  		grpc.WithTimeout(opts.timeout))
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  	client := hb.NewHeartbeaterClient(conn)
    80  	return &heatbeater{
    81  		runner:  agent,
    82  		opts:    opts,
    83  		iopts:   iopts,
    84  		msgChan: make(chan heartbeatMsg, 10),
    85  		doneCh:  make(chan struct{}, 1),
    86  		conn:    conn,
    87  		client:  client,
    88  	}, nil
    89  }
    90  
    91  func (h *heatbeater) isRunning() bool {
    92  	return atomic.LoadInt32(&h.running) != 0
    93  }
    94  
    95  func (h *heatbeater) isClosed() bool {
    96  	return atomic.LoadInt32(&h.closed) != 0
    97  }
    98  
    99  func (h *heatbeater) defaultHeartbeat() hb.HeartbeatRequest {
   100  	h.RLock()
   101  	req := hb.HeartbeatRequest{
   102  		OperatorUuid:   h.opts.operatorUUID,
   103  		ProcessRunning: h.runner.Running(),
   104  	}
   105  	h.RUnlock()
   106  	return req
   107  }
   108  
   109  func (h *heatbeater) start(d time.Duration) error {
   110  	if h.isRunning() {
   111  		return fmt.Errorf("heartbeater already running")
   112  	}
   113  	atomic.StoreInt32(&h.running, 1)
   114  	go h.heartbeatLoop(d)
   115  	return nil
   116  }
   117  
   118  func (h *heatbeater) sendHealthyHeartbeat() {
   119  	beat := h.defaultHeartbeat()
   120  	beat.Code = hb.HeartbeatCode_HEALTHY
   121  	h.sendHeartbeat(&beat)
   122  }
   123  
   124  func (h *heatbeater) heartbeatLoop(d time.Duration) {
   125  	defer func() {
   126  		atomic.StoreInt32(&h.running, 0)
   127  		h.doneCh <- struct{}{}
   128  	}()
   129  
   130  	// explicitly send first heartbeat as soon as we start
   131  	h.sendHealthyHeartbeat()
   132  
   133  	for {
   134  		select {
   135  		case msg := <-h.msgChan:
   136  			beat := h.defaultHeartbeat()
   137  			switch {
   138  			case msg.stop:
   139  				return
   140  			case msg.processTerminate:
   141  				beat.Code = hb.HeartbeatCode_PROCESS_TERMINATION
   142  				beat.Error = msg.err
   143  			case msg.overwritten:
   144  				beat.Code = hb.HeartbeatCode_OVERWRITTEN
   145  				beat.Error = msg.err
   146  			default:
   147  				h.opts.errorFn(fmt.Errorf(
   148  					"invalid heartbeatMsg received, one of stop|processTerminate|overwritten must be set. Received: %+v", msg))
   149  				return
   150  			}
   151  			h.sendHeartbeat(&beat)
   152  
   153  		default:
   154  			h.sendHealthyHeartbeat()
   155  			// NB(prateek): we use a sleep instead of a ticker because we've observed emperically
   156  			// the former reschedules go-routines with lower delays.
   157  			time.Sleep(d)
   158  		}
   159  	}
   160  }
   161  
   162  func (h *heatbeater) sendHeartbeat(r *hb.HeartbeatRequest) {
   163  	h.Lock()
   164  	defer h.Unlock()
   165  
   166  	// mark the first send attempt as success to get a base time to compare against
   167  	if h.lastHeartbeatTs.IsZero() {
   168  		h.lastHeartbeatTs = h.opts.nowFn()
   169  	}
   170  
   171  	logger := h.iopts.Logger()
   172  	_, err := h.client.Heartbeat(context.Background(), r)
   173  
   174  	// sent heartbeat successfully, break out
   175  	if err == nil {
   176  		return
   177  	}
   178  
   179  	// unable to send heartbeat
   180  	logger.Warn("unable to send heartbeat", zap.Error(err))
   181  	// check if this has been happening past the permitted period
   182  	var (
   183  		timeSinceLastSend = h.opts.nowFn().Sub(h.lastHeartbeatTs)
   184  		timeout           = h.opts.timeout
   185  	)
   186  	if timeSinceLastSend > timeout {
   187  		logger.Warn("unable to send heartbeats; timing out", zap.Duration("for", timeSinceLastSend))
   188  		go h.opts.timeoutFn(h.lastHeartbeatTs)
   189  	}
   190  }
   191  
   192  func (h *heatbeater) stop() error {
   193  	if !h.isRunning() {
   194  		return fmt.Errorf("heartbeater not running")
   195  	}
   196  
   197  	h.msgChan <- heartbeatMsg{stop: true}
   198  	<-h.doneCh
   199  	return nil
   200  }
   201  
   202  func (h *heatbeater) close() error {
   203  	if h.isClosed() {
   204  		return fmt.Errorf("heartbeater already closed")
   205  	}
   206  
   207  	atomic.StoreInt32(&h.closed, 1)
   208  	h.stop()
   209  	close(h.msgChan)
   210  	close(h.doneCh)
   211  
   212  	var err error
   213  	if h.conn != nil {
   214  		err = h.conn.Close()
   215  		h.conn = nil
   216  	}
   217  	return err
   218  }
   219  
   220  func (h *heatbeater) notifyProcessTermination(reason string) {
   221  	h.msgChan <- heartbeatMsg{
   222  		processTerminate: true,
   223  		err:              reason,
   224  	}
   225  }
   226  
   227  func (h *heatbeater) notifyOverwrite(reason string) {
   228  	h.msgChan <- heartbeatMsg{
   229  		overwritten: true,
   230  		err:         reason,
   231  	}
   232  }
   233  
// heartbeatMsg is a control message delivered to the heartbeat loop through
// msgChan. The loop checks the flags in the order stop, processTerminate,
// overwritten and treats a message with none of them set as invalid.
type heartbeatMsg struct {
	// stop asks the heartbeat loop to exit.
	stop             bool
	// processTerminate requests a PROCESS_TERMINATION heartbeat.
	processTerminate bool
	// overwritten requests an OVERWRITTEN heartbeat.
	overwritten      bool
	// err is copied into the Error field of the heartbeat that gets sent.
	err              string
}