github.com/m3db/m3@v1.5.0/src/cluster/services/heartbeat/etcd/store.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package etcd

import (
    "errors"
    "fmt"
    "strings"
    "sync"
    "time"

    "github.com/m3db/m3/src/cluster/etcd/watchmanager"
    "github.com/m3db/m3/src/cluster/generated/proto/placementpb"
    "github.com/m3db/m3/src/cluster/kv"
    "github.com/m3db/m3/src/cluster/placement"
    "github.com/m3db/m3/src/cluster/services"
    "github.com/m3db/m3/src/x/retry"
    "github.com/m3db/m3/src/x/watch"

    "github.com/golang/protobuf/proto"
    "github.com/uber-go/tally"
    clientv3 "go.etcd.io/etcd/client/v3"
    "go.uber.org/zap"
    "golang.org/x/net/context"
)

const (
    heartbeatKeyPrefix = "_hb"
    keySeparator       = "/"
    keyFormat          = "%s/%s"
)

var (
    noopCancel     func()
    errNoServiceID = errors.New("ServiceID cannot be empty")
)

// NewStore creates a heartbeat store based on etcd.
func NewStore(c *clientv3.Client, opts Options) (services.HeartbeatService, error) {
    if opts.ServiceID() == nil {
        return nil, errNoServiceID
    }

    scope := opts.InstrumentsOptions().MetricsScope()

    store := &client{
        cache:      newLeaseCache(),
        watchables: make(map[string]watch.Watchable),
        opts:       opts,
        sid:        opts.ServiceID(),
        logger:     opts.InstrumentsOptions().Logger(),
        retrier:    retry.NewRetrier(opts.RetryOptions()),
        m: clientMetrics{
            etcdGetError:   scope.Counter("etcd-get-error"),
            etcdPutError:   scope.Counter("etcd-put-error"),
            etcdLeaseError: scope.Counter("etcd-lease-error"),
        },

        l:       c.Lease,
        kv:      c.KV,
        watcher: c.Watcher,
    }

    wOpts := watchmanager.NewOptions().
        SetClient(c).
        SetUpdateFn(store.update).
        SetTickAndStopFn(store.tickAndStop).
        SetWatchOptions([]clientv3.OpOption{
            // WithPrefix so that the watch will receive any changes
            // from the instances under the service
            clientv3.WithPrefix(),
            // periodically (approximately every 10 minutes) check for the
            // latest data, with or without any update notification
            clientv3.WithProgressNotify(),
            // receive an initial notification once the watch channel is created
            clientv3.WithCreatedNotify(),
        }).
        SetWatchChanCheckInterval(opts.WatchChanCheckInterval()).
        SetWatchChanInitTimeout(opts.WatchChanInitTimeout()).
        SetWatchChanResetInterval(opts.WatchChanResetInterval()).
        SetInstrumentsOptions(opts.InstrumentsOptions())

    wm, err := watchmanager.NewWatchManager(wOpts)
    if err != nil {
        return nil, err
    }

    store.wm = wm

    return store, nil
}
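// Illustrative sketch (not part of the original file): wiring the store up
// against a local etcd endpoint. NewOptions and its SetServiceID setter are
// assumed to be this package's options constructor and may differ; the
// endpoint is a placeholder.
func exampleNewStore() (services.HeartbeatService, error) {
    cli, err := clientv3.New(clientv3.Config{
        Endpoints: []string{"127.0.0.1:2379"}, // placeholder endpoint
    })
    if err != nil {
        return nil, err
    }

    sid := services.NewServiceID().
        SetEnvironment("test").
        SetName("svc")

    // Assumed options constructor; see options.go in this package.
    return NewStore(cli, NewOptions().SetServiceID(sid))
}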
type client struct {
    sync.RWMutex

    cache      *leaseCache
    watchables map[string]watch.Watchable
    opts       Options
    sid        services.ServiceID
    logger     *zap.Logger
    retrier    retry.Retrier
    m          clientMetrics

    l       clientv3.Lease
    kv      clientv3.KV
    watcher clientv3.Watcher

    wm watchmanager.WatchManager
}

type clientMetrics struct {
    etcdGetError   tally.Counter
    etcdPutError   tally.Counter
    etcdLeaseError tally.Counter
}

func (c *client) Heartbeat(instance placement.Instance, ttl time.Duration) error {
    leaseID, ok := c.cache.get(c.sid, instance.ID(), ttl)
    if ok {
        ctx, cancel := c.context()
        defer cancel()

        _, err := c.l.KeepAliveOnce(ctx, leaseID)
        // If err != nil, it could be because the old lease has already
        // timed out on the server side, so we need to grant a new lease.
        if err == nil {
            return nil
        }
    }

    ctx, cancel := c.context()
    defer cancel()

    resp, err := c.l.Grant(ctx, int64(ttl/time.Second))
    if err != nil {
        c.m.etcdLeaseError.Inc(1)
        return err
    }

    ctx, cancel = c.context()
    defer cancel()

    instanceProto, err := instance.Proto()
    if err != nil {
        return err
    }

    instanceBytes, err := proto.Marshal(instanceProto)
    if err != nil {
        return err
    }

    _, err = c.kv.Put(
        ctx,
        heartbeatKey(c.sid, instance.ID()),
        string(instanceBytes),
        clientv3.WithLease(resp.ID),
    )
    if err != nil {
        c.m.etcdPutError.Inc(1)
        return err
    }

    c.cache.put(c.sid, instance.ID(), ttl, resp.ID)

    return nil
}

func (c *client) Get() ([]string, error) {
    return c.get(servicePrefix(c.sid))
}

func (c *client) get(key string) ([]string, error) {
    ctx, cancel := c.context()
    defer cancel()

    resp, err := c.kv.Get(
        ctx,
        key,
        clientv3.WithPrefix(),
        clientv3.WithKeysOnly(),
    )
    if err != nil {
        c.m.etcdGetError.Inc(1)
        return nil, err
    }

    r := make([]string, len(resp.Kvs))
    for i, kv := range resp.Kvs {
        r[i] = instanceFromKey(string(kv.Key), key)
    }

    return r, nil
}

func (c *client) GetInstances() ([]placement.Instance, error) {
    return c.getInstances(servicePrefix(c.sid))
}

func (c *client) getInstances(key string) ([]placement.Instance, error) {
    ctx, cancel := c.context()
    defer cancel()

    gr, err := c.kv.Get(ctx, key, clientv3.WithPrefix())
    if err != nil {
        c.m.etcdGetError.Inc(1)
        return nil, err
    }

    r := make([]placement.Instance, len(gr.Kvs))
    for i, kv := range gr.Kvs {
        var p placementpb.Instance
        if err := proto.Unmarshal(kv.Value, &p); err != nil {
            return nil, err
        }

        pi, err := placement.NewInstanceFromProto(&p)
        if err != nil {
            return nil, err
        }

        r[i] = pi
    }
    return r, nil
}

func (c *client) Delete(instance string) error {
    ctx, cancel := c.context()
    defer cancel()

    r, err := c.kv.Delete(ctx, heartbeatKey(c.sid, instance))
    if err != nil {
        return err
    }

    if r.Deleted == 0 {
        return fmt.Errorf("could not find heartbeat for service: %s, env: %s, instance: %s", c.sid.Name(), c.sid.Environment(), instance)
    }

    // NB(cw): we need to clean up the cached lease ID; otherwise the next
    // heartbeat might reuse the cached lease, and keeping alive an existing
    // lease won't work since the key has been deleted.
    c.cache.delete(c.sid, instance)
    return nil
}
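// Illustrative sketch (not part of the original file): a typical
// heartbeat-then-delete round trip against the store. The instance ID and
// TTL are placeholders; placement.NewInstance is m3's instance builder.
func exampleHeartbeat(store services.HeartbeatService) error {
    inst := placement.NewInstance().SetID("instance1")

    // Each call either renews the cached lease via KeepAliveOnce or, if
    // that fails, grants a fresh lease and re-puts the heartbeat key.
    if err := store.Heartbeat(inst, 10*time.Second); err != nil {
        return err
    }

    // Delete removes the heartbeat key and evicts the cached lease ID, so
    // the next Heartbeat grants a new lease rather than trying to renew a
    // lease whose key is gone.
    return store.Delete(inst.ID())
}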
func (c *client) Watch() (watch.Watch, error) {
    serviceKey := servicePrefix(c.sid)

    c.Lock()
    watchable, ok := c.watchables[serviceKey]
    if !ok {
        watchable = watch.NewWatchable()
        c.watchables[serviceKey] = watchable

        go c.wm.Watch(serviceKey)
    }
    c.Unlock()

    _, w, err := watchable.Watch()
    return w, err
}

func (c *client) update(key string, _ []*clientv3.Event) error {
    var (
        newValue []string
        err      error
    )
    // We need to retry here because if Get() fails on a watch update, it
    // would otherwise have to wait up to 10 minutes for the next
    // notification before trying again.
    if execErr := c.retrier.Attempt(func() error {
        newValue, err = c.get(key)
        if err == kv.ErrNotFound {
            // do not retry on ErrNotFound
            return retry.NonRetryableError(err)
        }
        return err
    }); execErr != nil {
        return execErr
    }

    c.RLock()
    w, ok := c.watchables[key]
    c.RUnlock()
    if !ok {
        return fmt.Errorf("unexpected: no watchable found for key: %s", key)
    }
    w.Update(newValue)

    return nil
}

func (c *client) tickAndStop(key string) bool {
    // fast path
    c.RLock()
    watchable, ok := c.watchables[key]
    c.RUnlock()
    if !ok {
        c.logger.Warn("unexpected: key is already cleaned up", zap.String("key", key))
        return true
    }

    if watchable.NumWatches() != 0 {
        return false
    }

    // slow path
    c.Lock()
    defer c.Unlock()
    watchable, ok = c.watchables[key]
    if !ok {
        // we do not expect this to happen
        c.logger.Warn("unexpected: key is already cleaned up", zap.String("key", key))
        return true
    }

    if watchable.NumWatches() != 0 {
        // a new watch has subscribed to the watchable, do not clean up
        return false
    }

    watchable.Close()
    delete(c.watchables, key)
    return true
}

func (c *client) context() (context.Context, context.CancelFunc) {
    ctx := context.Background()
    cancel := noopCancel
    if c.opts.RequestTimeout() > 0 {
        ctx, cancel = context.WithTimeout(ctx, c.opts.RequestTimeout())
    }

    return ctx, cancel
}

func heartbeatKey(sid services.ServiceID, instance string) string {
    return fmt.Sprintf(keyFormat, servicePrefix(sid), instance)
}

func instanceFromKey(key, servicePrefix string) string {
    return strings.TrimPrefix(
        strings.TrimPrefix(key, servicePrefix),
        keySeparator,
    )
}
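// Illustrative sketch (not part of the original file): consuming heartbeat
// updates. This assumes m3's x/watch interface, where C() signals that a new
// value is available and Get() returns the latest value (here, the []string
// of live instance IDs published by update above).
func exampleWatch(store services.HeartbeatService) ([]string, error) {
    w, err := store.Watch()
    if err != nil {
        return nil, err
    }
    defer w.Close()

    <-w.C() // blocks until the first update is delivered
    ids, _ := w.Get().([]string)
    return ids, nil
}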
// heartbeats for a service "svc" in env "test" should be stored under
// "_hb/test/svc". A service "svc" with no environment will be stored under
// "_hb/svc".
func servicePrefix(sid services.ServiceID) string {
    env := sid.Environment()
    if env == "" {
        return fmt.Sprintf(keyFormat, heartbeatKeyPrefix, sid.Name())
    }

    return fmt.Sprintf(
        keyFormat,
        heartbeatKeyPrefix,
        fmt.Sprintf(keyFormat, env, sid.Name()))
}

func newLeaseCache() *leaseCache {
    return &leaseCache{
        leases: make(map[string]map[time.Duration]clientv3.LeaseID),
    }
}

type leaseCache struct {
    sync.RWMutex

    leases map[string]map[time.Duration]clientv3.LeaseID
}

func (c *leaseCache) get(sid services.ServiceID, instance string, ttl time.Duration) (clientv3.LeaseID, bool) {
    c.RLock()
    defer c.RUnlock()

    leases, ok := c.leases[heartbeatKey(sid, instance)]
    if !ok {
        return clientv3.LeaseID(0), false
    }

    id, ok := leases[ttl]
    return id, ok
}

func (c *leaseCache) put(sid services.ServiceID, instance string, ttl time.Duration, id clientv3.LeaseID) {
    key := heartbeatKey(sid, instance)

    c.Lock()
    defer c.Unlock()

    leases, ok := c.leases[key]
    if !ok {
        leases = make(map[time.Duration]clientv3.LeaseID)
        c.leases[key] = leases
    }
    leases[ttl] = id
}

func (c *leaseCache) delete(sid services.ServiceID, instance string) {
    c.Lock()
    delete(c.leases, heartbeatKey(sid, instance))
    c.Unlock()
}
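// Illustrative sketch (not part of the original file): how the key helpers
// above compose and decompose heartbeat keys for a hypothetical service
// "svc" in environment "test".
func exampleKeys() {
    sid := services.NewServiceID().
        SetEnvironment("test").
        SetName("svc")

    prefix := servicePrefix(sid)          // "_hb/test/svc"
    key := heartbeatKey(sid, "instance1") // "_hb/test/svc/instance1"
    id := instanceFromKey(key, prefix)    // "instance1"

    fmt.Println(prefix, key, id)
}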