// Source: github.com/m3db/m3@v1.5.0/src/m3em/agent/heartbeater.go
//
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 21 package agent 22 23 import ( 24 "context" 25 "fmt" 26 "sync" 27 "sync/atomic" 28 "time" 29 30 hb "github.com/m3db/m3/src/m3em/generated/proto/heartbeat" 31 xclock "github.com/m3db/m3/src/x/clock" 32 "github.com/m3db/m3/src/x/instrument" 33 34 "go.uber.org/zap" 35 "google.golang.org/grpc" 36 ) 37 38 type runner interface { 39 Running() bool 40 } 41 42 type timeoutFn func(time.Time) 43 type errorFn func(err error) 44 45 type heartbeatOpts struct { 46 operatorUUID string 47 endpoint string 48 nowFn xclock.NowFn 49 timeout time.Duration 50 timeoutFn timeoutFn 51 errorFn errorFn 52 } 53 54 type heatbeater struct { 55 sync.RWMutex 56 iopts instrument.Options 57 runner runner 58 opts heartbeatOpts 59 running int32 60 closed int32 61 msgChan chan heartbeatMsg 62 doneCh chan struct{} 63 conn *grpc.ClientConn 64 client hb.HeartbeaterClient 65 lastHeartbeatTs time.Time 66 } 67 68 func newHeartbeater( 69 agent runner, 70 opts heartbeatOpts, 71 iopts instrument.Options, 72 ) (*heatbeater, error) { 73 conn, err := grpc.Dial(opts.endpoint, 74 grpc.WithInsecure(), 75 grpc.WithTimeout(opts.timeout)) 76 if err != nil { 77 return nil, err 78 } 79 client := hb.NewHeartbeaterClient(conn) 80 return &heatbeater{ 81 runner: agent, 82 opts: opts, 83 iopts: iopts, 84 msgChan: make(chan heartbeatMsg, 10), 85 doneCh: make(chan struct{}, 1), 86 conn: conn, 87 client: client, 88 }, nil 89 } 90 91 func (h *heatbeater) isRunning() bool { 92 return atomic.LoadInt32(&h.running) != 0 93 } 94 95 func (h *heatbeater) isClosed() bool { 96 return atomic.LoadInt32(&h.closed) != 0 97 } 98 99 func (h *heatbeater) defaultHeartbeat() hb.HeartbeatRequest { 100 h.RLock() 101 req := hb.HeartbeatRequest{ 102 OperatorUuid: h.opts.operatorUUID, 103 ProcessRunning: h.runner.Running(), 104 } 105 h.RUnlock() 106 return req 107 } 108 109 func (h *heatbeater) start(d time.Duration) error { 110 if h.isRunning() { 111 return fmt.Errorf("heartbeater already running") 112 } 113 atomic.StoreInt32(&h.running, 1) 114 go 
h.heartbeatLoop(d) 115 return nil 116 } 117 118 func (h *heatbeater) sendHealthyHeartbeat() { 119 beat := h.defaultHeartbeat() 120 beat.Code = hb.HeartbeatCode_HEALTHY 121 h.sendHeartbeat(&beat) 122 } 123 124 func (h *heatbeater) heartbeatLoop(d time.Duration) { 125 defer func() { 126 atomic.StoreInt32(&h.running, 0) 127 h.doneCh <- struct{}{} 128 }() 129 130 // explicitly send first heartbeat as soon as we start 131 h.sendHealthyHeartbeat() 132 133 for { 134 select { 135 case msg := <-h.msgChan: 136 beat := h.defaultHeartbeat() 137 switch { 138 case msg.stop: 139 return 140 case msg.processTerminate: 141 beat.Code = hb.HeartbeatCode_PROCESS_TERMINATION 142 beat.Error = msg.err 143 case msg.overwritten: 144 beat.Code = hb.HeartbeatCode_OVERWRITTEN 145 beat.Error = msg.err 146 default: 147 h.opts.errorFn(fmt.Errorf( 148 "invalid heartbeatMsg received, one of stop|processTerminate|overwritten must be set. Received: %+v", msg)) 149 return 150 } 151 h.sendHeartbeat(&beat) 152 153 default: 154 h.sendHealthyHeartbeat() 155 // NB(prateek): we use a sleep instead of a ticker because we've observed emperically 156 // the former reschedules go-routines with lower delays. 
157 time.Sleep(d) 158 } 159 } 160 } 161 162 func (h *heatbeater) sendHeartbeat(r *hb.HeartbeatRequest) { 163 h.Lock() 164 defer h.Unlock() 165 166 // mark the first send attempt as success to get a base time to compare against 167 if h.lastHeartbeatTs.IsZero() { 168 h.lastHeartbeatTs = h.opts.nowFn() 169 } 170 171 logger := h.iopts.Logger() 172 _, err := h.client.Heartbeat(context.Background(), r) 173 174 // sent heartbeat successfully, break out 175 if err == nil { 176 return 177 } 178 179 // unable to send heartbeat 180 logger.Warn("unable to send heartbeat", zap.Error(err)) 181 // check if this has been happening past the permitted period 182 var ( 183 timeSinceLastSend = h.opts.nowFn().Sub(h.lastHeartbeatTs) 184 timeout = h.opts.timeout 185 ) 186 if timeSinceLastSend > timeout { 187 logger.Warn("unable to send heartbeats; timing out", zap.Duration("for", timeSinceLastSend)) 188 go h.opts.timeoutFn(h.lastHeartbeatTs) 189 } 190 } 191 192 func (h *heatbeater) stop() error { 193 if !h.isRunning() { 194 return fmt.Errorf("heartbeater not running") 195 } 196 197 h.msgChan <- heartbeatMsg{stop: true} 198 <-h.doneCh 199 return nil 200 } 201 202 func (h *heatbeater) close() error { 203 if h.isClosed() { 204 return fmt.Errorf("heartbeater already closed") 205 } 206 207 atomic.StoreInt32(&h.closed, 1) 208 h.stop() 209 close(h.msgChan) 210 close(h.doneCh) 211 212 var err error 213 if h.conn != nil { 214 err = h.conn.Close() 215 h.conn = nil 216 } 217 return err 218 } 219 220 func (h *heatbeater) notifyProcessTermination(reason string) { 221 h.msgChan <- heartbeatMsg{ 222 processTerminate: true, 223 err: reason, 224 } 225 } 226 227 func (h *heatbeater) notifyOverwrite(reason string) { 228 h.msgChan <- heartbeatMsg{ 229 overwritten: true, 230 err: reason, 231 } 232 } 233 234 type heartbeatMsg struct { 235 stop bool 236 processTerminate bool 237 overwritten bool 238 err string 239 }