// github.com/m3db/m3@v1.5.0/src/m3em/node/heartbeat.go

// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

20 21 package node 22 23 import ( 24 "fmt" 25 "sync" 26 "time" 27 28 hb "github.com/m3db/m3/src/m3em/generated/proto/heartbeat" 29 "github.com/m3db/m3/src/x/instrument" 30 31 context "golang.org/x/net/context" 32 "google.golang.org/grpc" 33 "google.golang.org/grpc/codes" 34 ) 35 36 type heartbeatRouter struct { 37 sync.RWMutex 38 endpoint string 39 servers map[string]hb.HeartbeaterServer 40 } 41 42 // NewHeartbeatRouter returns a new heartbeat router 43 func NewHeartbeatRouter(endpoint string) HeartbeatRouter { 44 return &heartbeatRouter{ 45 endpoint: endpoint, 46 servers: make(map[string]hb.HeartbeaterServer), 47 } 48 } 49 50 func (hr *heartbeatRouter) Endpoint() string { 51 return hr.endpoint 52 } 53 54 func (hr *heartbeatRouter) Register(id string, server hb.HeartbeaterServer) error { 55 hr.Lock() 56 defer hr.Unlock() 57 58 if _, ok := hr.servers[id]; ok { 59 return fmt.Errorf("uuid is already registered: %s", id) 60 } 61 62 hr.servers[id] = server 63 return nil 64 } 65 66 func (hr *heartbeatRouter) Deregister(id string) error { 67 hr.Lock() 68 defer hr.Unlock() 69 70 if _, ok := hr.servers[id]; !ok { 71 return fmt.Errorf("unknown uuid: %s", id) 72 } 73 74 delete(hr.servers, id) 75 return nil 76 } 77 78 func (hr *heartbeatRouter) Heartbeat( 79 ctx context.Context, 80 msg *hb.HeartbeatRequest, 81 ) (*hb.HeartbeatResponse, error) { 82 if msg == nil { 83 return nil, grpc.Errorf(codes.InvalidArgument, "received nil heartbeat msg") 84 } 85 86 if msg.OperatorUuid == "" { 87 return nil, grpc.Errorf(codes.InvalidArgument, "received heartbeat msg without operator uuid: %+v", *msg) 88 } 89 90 hr.RLock() 91 defer hr.RUnlock() 92 server, ok := hr.servers[msg.OperatorUuid] 93 if !ok { 94 return nil, grpc.Errorf(codes.InvalidArgument, "received heartbeat msg contains un-registered operator uuid: %+v", *msg) 95 } 96 97 return server.Heartbeat(ctx, msg) 98 } 99 100 // Receive heartbeats from remote agent: this is to ensure capture of asynchronous 101 // error conditions, e.g. 
when a child process kicked off by the agent dies, the 102 // associated ServiceNode is informed of the crash. 103 type opHeartbeatServer struct { 104 sync.RWMutex 105 opts HeartbeatOptions 106 iopts instrument.Options 107 listeners *listenerGroup 108 lastHeartbeat hb.HeartbeatRequest 109 lastHeartbeatTs time.Time 110 running bool 111 stopChan chan struct{} 112 wg sync.WaitGroup 113 } 114 115 func (h *opHeartbeatServer) Heartbeat( 116 ctx context.Context, 117 msg *hb.HeartbeatRequest, 118 ) (*hb.HeartbeatResponse, error) { 119 nowFn := h.opts.NowFn() 120 121 switch msg.GetCode() { 122 case hb.HeartbeatCode_HEALTHY: 123 h.updateLastHeartbeat(nowFn(), msg) 124 125 case hb.HeartbeatCode_PROCESS_TERMINATION: 126 h.updateLastHeartbeat(nowFn(), msg) 127 h.listeners.notifyTermination(msg.GetError()) 128 129 case hb.HeartbeatCode_OVERWRITTEN: 130 h.listeners.notifyOverwrite(msg.GetError()) 131 h.stop() 132 133 default: 134 return nil, grpc.Errorf(codes.InvalidArgument, "received unknown heartbeat msg: %v", *msg) 135 } 136 137 return &hb.HeartbeatResponse{}, nil 138 } 139 140 func newHeartbeater( 141 lg *listenerGroup, 142 opts HeartbeatOptions, 143 iopts instrument.Options, 144 ) *opHeartbeatServer { 145 h := &opHeartbeatServer{ 146 opts: opts, 147 iopts: iopts, 148 listeners: lg, 149 stopChan: make(chan struct{}, 1), 150 } 151 return h 152 } 153 154 func (h *opHeartbeatServer) start() error { 155 h.Lock() 156 defer h.Unlock() 157 if h.running { 158 return fmt.Errorf("already heartbeating, terminate existing process first") 159 } 160 161 h.running = true 162 h.wg.Add(1) 163 go h.monitorTimeout() 164 return nil 165 } 166 167 func (h *opHeartbeatServer) lastHeartbeatTime() time.Time { 168 h.RLock() 169 defer h.RUnlock() 170 return h.lastHeartbeatTs 171 } 172 173 func (h *opHeartbeatServer) monitorTimeout() { 174 defer h.wg.Done() 175 var ( 176 checkInterval = h.opts.CheckInterval() 177 timeoutInterval = h.opts.Timeout() 178 nowFn = h.opts.NowFn() 179 lastNotificationTs 
time.Time 180 ) 181 182 for { 183 select { 184 case <-h.stopChan: 185 return 186 default: 187 last := h.lastHeartbeatTime() 188 if !last.IsZero() && nowFn().Sub(last) > timeoutInterval && lastNotificationTs != last { 189 h.listeners.notifyTimeout(last) 190 lastNotificationTs = last 191 } 192 // NB(prateek): we use a sleep instead of a ticker because we've observed emperically 193 // the former reschedules go-routines with lower delays. 194 time.Sleep(checkInterval) 195 } 196 } 197 } 198 199 func (h *opHeartbeatServer) updateLastHeartbeat(ts time.Time, msg *hb.HeartbeatRequest) { 200 h.Lock() 201 h.lastHeartbeat = *msg 202 h.lastHeartbeatTs = ts 203 h.Unlock() 204 } 205 206 func (h *opHeartbeatServer) stop() { 207 h.Lock() 208 if !h.running { 209 h.Unlock() 210 return 211 } 212 213 h.running = false 214 h.stopChan <- struct{}{} 215 h.Unlock() 216 h.wg.Wait() 217 }