github.com/m3db/m3@v1.5.0/src/m3em/node/heartbeat.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package node
    22  
    23  import (
    24  	"fmt"
    25  	"sync"
    26  	"time"
    27  
    28  	hb "github.com/m3db/m3/src/m3em/generated/proto/heartbeat"
    29  	"github.com/m3db/m3/src/x/instrument"
    30  
    31  	context "golang.org/x/net/context"
    32  	"google.golang.org/grpc"
    33  	"google.golang.org/grpc/codes"
    34  )
    35  
// heartbeatRouter dispatches incoming heartbeat requests to the
// hb.HeartbeaterServer registered for the request's operator UUID.
//
// The embedded RWMutex guards servers, which maps a registered id to the
// server handling heartbeats for that id. endpoint is set at construction
// and is only read by the methods in this file.
type heartbeatRouter struct {
	sync.RWMutex
	endpoint string
	servers  map[string]hb.HeartbeaterServer
}
    41  
    42  // NewHeartbeatRouter returns a new heartbeat router
    43  func NewHeartbeatRouter(endpoint string) HeartbeatRouter {
    44  	return &heartbeatRouter{
    45  		endpoint: endpoint,
    46  		servers:  make(map[string]hb.HeartbeaterServer),
    47  	}
    48  }
    49  
// Endpoint returns the endpoint string the router was constructed with.
func (hr *heartbeatRouter) Endpoint() string {
	return hr.endpoint
}
    53  
    54  func (hr *heartbeatRouter) Register(id string, server hb.HeartbeaterServer) error {
    55  	hr.Lock()
    56  	defer hr.Unlock()
    57  
    58  	if _, ok := hr.servers[id]; ok {
    59  		return fmt.Errorf("uuid is already registered: %s", id)
    60  	}
    61  
    62  	hr.servers[id] = server
    63  	return nil
    64  }
    65  
    66  func (hr *heartbeatRouter) Deregister(id string) error {
    67  	hr.Lock()
    68  	defer hr.Unlock()
    69  
    70  	if _, ok := hr.servers[id]; !ok {
    71  		return fmt.Errorf("unknown uuid: %s", id)
    72  	}
    73  
    74  	delete(hr.servers, id)
    75  	return nil
    76  }
    77  
    78  func (hr *heartbeatRouter) Heartbeat(
    79  	ctx context.Context,
    80  	msg *hb.HeartbeatRequest,
    81  ) (*hb.HeartbeatResponse, error) {
    82  	if msg == nil {
    83  		return nil, grpc.Errorf(codes.InvalidArgument, "received nil heartbeat msg")
    84  	}
    85  
    86  	if msg.OperatorUuid == "" {
    87  		return nil, grpc.Errorf(codes.InvalidArgument, "received heartbeat msg without operator uuid: %+v", *msg)
    88  	}
    89  
    90  	hr.RLock()
    91  	defer hr.RUnlock()
    92  	server, ok := hr.servers[msg.OperatorUuid]
    93  	if !ok {
    94  		return nil, grpc.Errorf(codes.InvalidArgument, "received heartbeat msg contains un-registered operator uuid: %+v", *msg)
    95  	}
    96  
    97  	return server.Heartbeat(ctx, msg)
    98  }
    99  
// opHeartbeatServer receives heartbeats from a remote agent: this is to ensure
// capture of asynchronous error conditions, e.g. when a child process kicked
// off by the agent dies, the associated ServiceNode is informed of the crash.
//
// Fields:
//   - the embedded RWMutex guards lastHeartbeat, lastHeartbeatTs and running.
//   - opts supplies the clock (NowFn), the check interval and the timeout.
//   - iopts is stored at construction; it is not read by code in this file.
//   - listeners is notified of termination, overwrite and timeout events.
//   - lastHeartbeat / lastHeartbeatTs hold a copy of, and the receive time of,
//     the most recent HEALTHY or PROCESS_TERMINATION heartbeat.
//   - running is true between a successful start() and the next stop().
//   - stopChan tells the monitorTimeout goroutine to exit.
//   - wg tracks the monitorTimeout goroutine so stop() can wait for it.
type opHeartbeatServer struct {
	sync.RWMutex
	opts            HeartbeatOptions
	iopts           instrument.Options
	listeners       *listenerGroup
	lastHeartbeat   hb.HeartbeatRequest
	lastHeartbeatTs time.Time
	running         bool
	stopChan        chan struct{}
	wg              sync.WaitGroup
}
   114  
   115  func (h *opHeartbeatServer) Heartbeat(
   116  	ctx context.Context,
   117  	msg *hb.HeartbeatRequest,
   118  ) (*hb.HeartbeatResponse, error) {
   119  	nowFn := h.opts.NowFn()
   120  
   121  	switch msg.GetCode() {
   122  	case hb.HeartbeatCode_HEALTHY:
   123  		h.updateLastHeartbeat(nowFn(), msg)
   124  
   125  	case hb.HeartbeatCode_PROCESS_TERMINATION:
   126  		h.updateLastHeartbeat(nowFn(), msg)
   127  		h.listeners.notifyTermination(msg.GetError())
   128  
   129  	case hb.HeartbeatCode_OVERWRITTEN:
   130  		h.listeners.notifyOverwrite(msg.GetError())
   131  		h.stop()
   132  
   133  	default:
   134  		return nil, grpc.Errorf(codes.InvalidArgument, "received unknown heartbeat msg: %v", *msg)
   135  	}
   136  
   137  	return &hb.HeartbeatResponse{}, nil
   138  }
   139  
   140  func newHeartbeater(
   141  	lg *listenerGroup,
   142  	opts HeartbeatOptions,
   143  	iopts instrument.Options,
   144  ) *opHeartbeatServer {
   145  	h := &opHeartbeatServer{
   146  		opts:      opts,
   147  		iopts:     iopts,
   148  		listeners: lg,
   149  		stopChan:  make(chan struct{}, 1),
   150  	}
   151  	return h
   152  }
   153  
   154  func (h *opHeartbeatServer) start() error {
   155  	h.Lock()
   156  	defer h.Unlock()
   157  	if h.running {
   158  		return fmt.Errorf("already heartbeating, terminate existing process first")
   159  	}
   160  
   161  	h.running = true
   162  	h.wg.Add(1)
   163  	go h.monitorTimeout()
   164  	return nil
   165  }
   166  
   167  func (h *opHeartbeatServer) lastHeartbeatTime() time.Time {
   168  	h.RLock()
   169  	defer h.RUnlock()
   170  	return h.lastHeartbeatTs
   171  }
   172  
   173  func (h *opHeartbeatServer) monitorTimeout() {
   174  	defer h.wg.Done()
   175  	var (
   176  		checkInterval      = h.opts.CheckInterval()
   177  		timeoutInterval    = h.opts.Timeout()
   178  		nowFn              = h.opts.NowFn()
   179  		lastNotificationTs time.Time
   180  	)
   181  
   182  	for {
   183  		select {
   184  		case <-h.stopChan:
   185  			return
   186  		default:
   187  			last := h.lastHeartbeatTime()
   188  			if !last.IsZero() && nowFn().Sub(last) > timeoutInterval && lastNotificationTs != last {
   189  				h.listeners.notifyTimeout(last)
   190  				lastNotificationTs = last
   191  			}
   192  			// NB(prateek): we use a sleep instead of a ticker because we've observed emperically
   193  			// the former reschedules go-routines with lower delays.
   194  			time.Sleep(checkInterval)
   195  		}
   196  	}
   197  }
   198  
   199  func (h *opHeartbeatServer) updateLastHeartbeat(ts time.Time, msg *hb.HeartbeatRequest) {
   200  	h.Lock()
   201  	h.lastHeartbeat = *msg
   202  	h.lastHeartbeatTs = ts
   203  	h.Unlock()
   204  }
   205  
   206  func (h *opHeartbeatServer) stop() {
   207  	h.Lock()
   208  	if !h.running {
   209  		h.Unlock()
   210  		return
   211  	}
   212  
   213  	h.running = false
   214  	h.stopChan <- struct{}{}
   215  	h.Unlock()
   216  	h.wg.Wait()
   217  }