github.com/m3db/m3@v1.5.0/src/msg/producer/writer/consumer_writer.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package writer 22 23 import ( 24 "context" 25 "errors" 26 "fmt" 27 "io" 28 "net" 29 "sync" 30 "time" 31 32 "github.com/m3db/m3/src/msg/generated/proto/msgpb" 33 "github.com/m3db/m3/src/msg/protocol/proto" 34 "github.com/m3db/m3/src/x/clock" 35 xio "github.com/m3db/m3/src/x/io" 36 "github.com/m3db/m3/src/x/retry" 37 38 "github.com/uber-go/tally" 39 "go.uber.org/zap" 40 ) 41 42 const ( 43 defaultRetryForever = true 44 ) 45 46 var ( 47 errInvalidConnection = errors.New("connection is invalid") 48 u uninitializedReadWriter 49 ) 50 51 type consumerWriter interface { 52 // Address returns the consumer address. 53 Address() string 54 55 // Write writes the bytes, it is thread safe per connection index. 56 Write(connIndex int, b []byte) error 57 58 // Init initializes the consumer writer. 59 Init() 60 61 // Close closes the consumer writer. 62 Close() 63 } 64 65 type consumerWriterMetrics struct { 66 writeInvalidConn tally.Counter 67 readInvalidConn tally.Counter 68 ackError tally.Counter 69 decodeError tally.Counter 70 encodeError tally.Counter 71 resetTooSoon tally.Counter 72 resetSuccess tally.Counter 73 resetError tally.Counter 74 connectError tally.Counter 75 setKeepAliveError tally.Counter 76 setKeepAlivePeriodError tally.Counter 77 } 78 79 func newConsumerWriterMetrics(scope tally.Scope) consumerWriterMetrics { 80 return consumerWriterMetrics{ 81 writeInvalidConn: scope.Counter("write-invalid-conn"), 82 readInvalidConn: scope.Counter("read-invalid-conn"), 83 ackError: scope.Counter("ack-error"), 84 decodeError: scope.Counter("decode-error"), 85 encodeError: scope.Counter("encode-error"), 86 resetTooSoon: scope.Counter("reset-too-soon"), 87 resetSuccess: scope.Counter("reset-success"), 88 resetError: scope.Counter("reset-error"), 89 connectError: scope.Counter("connect-error"), 90 setKeepAliveError: scope.Counter("set-keep-alive-error"), 91 setKeepAlivePeriodError: scope.Counter("set-keep-alive-period-error"), 92 } 93 } 94 95 type connectFn func(addr string) (io.ReadWriteCloser, error) 96 97 type connectAllFn func(addr string) ([]io.ReadWriteCloser, error) 98 99 type consumerWriterImpl struct { 100 writeState consumerWriterImplWriteState 101 102 addr string 103 router ackRouter 104 opts Options 105 connOpts ConnectionOptions 106 ackRetrier retry.Retrier 107 connRetrier retry.Retrier 108 logger *zap.Logger 109 110 resetCh chan struct{} 111 doneCh chan struct{} 112 wg sync.WaitGroup 113 m consumerWriterMetrics 114 115 nowFn clock.NowFn 116 connectFn connectFn 117 } 118 119 type consumerWriterImplWriteState struct { 120 sync.RWMutex 121 122 closed bool 123 validConns bool 124 125 // conns keeps active connections. 126 // Note: readers will take a reference to this slice with a lock 127 // then loop through it and call decode on decoders, so not safe 128 // to reuse. 129 conns []*connection 130 lastResetNanos int64 131 } 132 133 type connection struct { 134 writeLock sync.Mutex 135 conn io.ReadWriteCloser 136 w xio.ResettableWriter 137 decoder proto.Decoder 138 ack msgpb.Ack 139 } 140 141 func newConsumerWriter( 142 addr string, 143 router ackRouter, 144 opts Options, 145 m consumerWriterMetrics, 146 ) consumerWriter { 147 if opts == nil { 148 opts = NewOptions() 149 } 150 151 connOpts := opts.ConnectionOptions() 152 w := &consumerWriterImpl{ 153 addr: addr, 154 router: router, 155 opts: opts, 156 connOpts: connOpts, 157 ackRetrier: retry.NewRetrier(opts.AckErrorRetryOptions()), 158 connRetrier: retry.NewRetrier(connOpts.RetryOptions().SetForever(defaultRetryForever)), 159 logger: opts.InstrumentOptions().Logger(), 160 resetCh: make(chan struct{}, 1), 161 doneCh: make(chan struct{}), 162 m: m, 163 nowFn: time.Now, 164 } 165 w.connectFn = w.connectNoRetry 166 167 // Initialize no-op connections since it's valid even if connecting the 168 // first time fails to continue to try to write to the writer. 169 // Note: Also tests try to break a non-connected writer. 170 conns := make([]io.ReadWriteCloser, 0, connOpts.NumConnections()) 171 for i := 0; i < connOpts.NumConnections(); i++ { 172 conns = append(conns, u) 173 } 174 // NB(r): Reset at epoch since a connection failure should trigger 175 // an immediate reset after first connection attempt (if write fails 176 // since first connection is with no retry). 177 w.reset(resetOptions{ 178 connections: conns, 179 at: time.Time{}, 180 validConns: false, 181 }) 182 183 // Try connecting without retry first attempt. 184 connectAllNoRetry := w.newConnectFn(connectOptions{retry: false}) 185 if err := w.resetWithConnectFn(connectAllNoRetry); err != nil { 186 w.notifyReset(err) 187 } 188 return w 189 } 190 191 func (w *consumerWriterImpl) Address() string { 192 return w.addr 193 } 194 195 // Write should fail fast so that the write could be tried on other 196 // consumer writers that are sharing the message queue. 197 func (w *consumerWriterImpl) Write(connIndex int, b []byte) error { 198 w.writeState.RLock() 199 if !w.writeState.validConns || len(w.writeState.conns) == 0 { 200 w.writeState.RUnlock() 201 w.m.writeInvalidConn.Inc(1) 202 return errInvalidConnection 203 } 204 if connIndex < 0 || connIndex >= len(w.writeState.conns) { 205 w.writeState.RUnlock() 206 return fmt.Errorf("connection index out of range: %d", connIndex) 207 } 208 209 writeConn := w.writeState.conns[connIndex] 210 211 // Make sure only writer to this connection. 212 writeConn.writeLock.Lock() 213 _, err := writeConn.w.Write(b) 214 writeConn.writeLock.Unlock() 215 216 // Hold onto the write state lock until done, since 217 // closing connections are done by acquiring the write state lock. 218 w.writeState.RUnlock() 219 220 if err != nil { 221 w.notifyReset(err) 222 w.m.encodeError.Inc(1) 223 } 224 225 return err 226 } 227 228 func (w *consumerWriterImpl) Init() { 229 w.wg.Add(1) 230 go func() { 231 w.resetConnectionUntilClose() 232 w.wg.Done() 233 }() 234 235 for i := 0; i < w.connOpts.NumConnections(); i++ { 236 idx := i 237 w.wg.Add(1) 238 go func() { 239 w.readAcksUntilClose(idx) 240 w.wg.Done() 241 }() 242 } 243 244 w.wg.Add(1) 245 go func() { 246 w.flushUntilClose() 247 w.wg.Done() 248 }() 249 } 250 251 func (w *consumerWriterImpl) flushUntilClose() { 252 flushTicker := time.NewTicker(w.connOpts.FlushInterval()) 253 defer flushTicker.Stop() 254 255 for { 256 select { 257 case <-flushTicker.C: 258 w.writeState.RLock() 259 for _, conn := range w.writeState.conns { 260 conn.writeLock.Lock() 261 if err := conn.w.Flush(); err != nil { 262 w.notifyReset(err) 263 } 264 conn.writeLock.Unlock() 265 } 266 // Hold onto the write state lock until done, since 267 // closing connections are done by acquiring the write state lock. 268 w.writeState.RUnlock() 269 case <-w.doneCh: 270 return 271 } 272 } 273 } 274 275 func (w *consumerWriterImpl) resetConnectionUntilClose() { 276 for { 277 select { 278 case <-w.resetCh: 279 // Avoid resetting too frequent. 280 if w.resetTooSoon() { 281 w.m.resetTooSoon.Inc(1) 282 continue 283 } 284 // Connect with retry. 285 connectAllWithRetry := w.newConnectFn(connectOptions{retry: true}) 286 if err := w.resetWithConnectFn(connectAllWithRetry); err != nil { 287 w.m.resetError.Inc(1) 288 w.logger.Error("could not reconnect", zap.String("address", w.addr), zap.Error(err)) 289 continue 290 } 291 w.m.resetSuccess.Inc(1) 292 w.logger.Info("reconnected", zap.String("address", w.addr)) 293 case <-w.doneCh: 294 w.writeState.Lock() 295 for _, c := range w.writeState.conns { 296 c.conn.Close() 297 } 298 w.writeState.Unlock() 299 return 300 } 301 } 302 } 303 304 func (w *consumerWriterImpl) resetTooSoon() bool { 305 w.writeState.RLock() 306 defer w.writeState.RUnlock() 307 return w.nowFn().UnixNano() < w.writeState.lastResetNanos+int64(w.connOpts.ResetDelay()) 308 } 309 310 func (w *consumerWriterImpl) resetWithConnectFn(fn connectAllFn) error { 311 w.writeState.Lock() 312 w.writeState.validConns = false 313 w.writeState.Unlock() 314 conns, err := fn(w.addr) 315 if err != nil { 316 return err 317 } 318 w.reset(resetOptions{ 319 connections: conns, 320 at: w.nowFn(), 321 validConns: true, 322 }) 323 return nil 324 } 325 326 func (w *consumerWriterImpl) readAcksUntilClose(idx int) { 327 for { 328 select { 329 case <-w.doneCh: 330 return 331 default: 332 w.ackRetrier.AttemptWhile(w.continueFn, 333 func() error { 334 return w.readAcks(idx) 335 }) 336 } 337 } 338 } 339 340 func (w *consumerWriterImpl) continueFn(int) bool { 341 return !w.isClosed() 342 } 343 344 func (w *consumerWriterImpl) readAcks(idx int) error { 345 w.writeState.RLock() 346 validConns := w.writeState.validConns 347 conn := w.writeState.conns[idx] 348 w.writeState.RUnlock() 349 if !validConns { 350 w.m.readInvalidConn.Inc(1) 351 return errInvalidConnection 352 } 353 354 // Read from decoder, safe to read from acquired decoder as not re-used. 355 // NB(cw) The proto needs to be cleaned up because the gogo protobuf 356 // unmarshalling will append to the underlying slice. 357 conn.ack.Metadata = conn.ack.Metadata[:0] 358 err := conn.decoder.Decode(&conn.ack) 359 if err != nil { 360 w.notifyReset(err) 361 w.m.decodeError.Inc(1) 362 return err 363 } 364 for _, m := range conn.ack.Metadata { 365 if err := w.router.Ack(newMetadataFromProto(m)); err != nil { 366 w.m.ackError.Inc(1) 367 // This is fine, usually this means the ack has been acked. 368 w.logger.Error("could not ack metadata", zap.Error(err)) 369 } 370 } 371 372 return nil 373 } 374 375 func (w *consumerWriterImpl) Close() { 376 w.writeState.Lock() 377 wasClosed := w.writeState.closed 378 w.writeState.closed = true 379 w.writeState.Unlock() 380 381 if wasClosed { 382 return 383 } 384 385 close(w.doneCh) 386 387 w.wg.Wait() 388 } 389 390 func (w *consumerWriterImpl) notifyReset(err error) { 391 select { 392 case w.resetCh <- struct{}{}: 393 if err != nil { 394 w.logger.Error("connection error", zap.Error(err)) 395 } 396 default: 397 } 398 } 399 400 func (w *consumerWriterImpl) isClosed() bool { 401 w.writeState.RLock() 402 defer w.writeState.RUnlock() 403 return w.writeState.closed 404 } 405 406 type resetOptions struct { 407 connections []io.ReadWriteCloser 408 at time.Time 409 validConns bool 410 } 411 412 func (w *consumerWriterImpl) reset(opts resetOptions) { 413 w.writeState.Lock() 414 prevConns := w.writeState.conns 415 defer func() { 416 w.writeState.Unlock() 417 // Close existing connections outside of locks. 418 for _, c := range prevConns { 419 c.conn.Close() 420 } 421 }() 422 423 var ( 424 wOpts = xio.ResettableWriterOptions{ 425 WriteBufferSize: w.connOpts.WriteBufferSize(), 426 } 427 428 rwOpts = w.opts.DecoderOptions().RWOptions() 429 writerFn = rwOpts.ResettableWriterFn() 430 ) 431 432 w.writeState.conns = make([]*connection, 0, len(opts.connections)) 433 for _, conn := range opts.connections { 434 wr := writerFn(u, wOpts) 435 wr.Reset(conn) 436 437 decoder := proto.NewDecoder(conn, w.opts.DecoderOptions(), w.connOpts.ReadBufferSize()) 438 newConn := &connection{ 439 conn: conn, 440 w: wr, 441 decoder: decoder, 442 } 443 444 w.writeState.conns = append(w.writeState.conns, newConn) 445 } 446 447 w.writeState.lastResetNanos = opts.at.UnixNano() 448 w.writeState.validConns = opts.validConns 449 } 450 451 func (w *consumerWriterImpl) connectNoRetry(addr string) (io.ReadWriteCloser, error) { 452 // Upcast readWriterWithTimeout to the interface; this allows us to mock out the connectNoRetry function in tests. 453 return w.connectNoRetryWithTimeout(addr) 454 } 455 456 func (w *consumerWriterImpl) connectNoRetryWithTimeout(addr string) (readWriterWithTimeout, error) { 457 // N.B.: this is roughly equivalent to what net.DialTimeout does; shouldn't introduce performance regressions. 458 ctx, cancel := context.WithTimeout(context.Background(), w.connOpts.DialTimeout()) 459 defer cancel() 460 461 conn, err := w.dialContext(ctx, addr) 462 if err != nil { 463 w.m.connectError.Inc(1) 464 return readWriterWithTimeout{}, err 465 } 466 tcpConn, ok := conn.(keepAlivable) 467 if !ok { 468 // If using a custom dialer which doesn't return *net.TCPConn, users are responsible for TCP keep alive options 469 // themselves. 470 return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil 471 } 472 if err = tcpConn.SetKeepAlive(true); err != nil { 473 w.m.setKeepAliveError.Inc(1) 474 } 475 keepAlivePeriod := w.connOpts.KeepAlivePeriod() 476 if keepAlivePeriod <= 0 { 477 return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil 478 } 479 if err = tcpConn.SetKeepAlivePeriod(keepAlivePeriod); err != nil { 480 w.m.setKeepAlivePeriodError.Inc(1) 481 } 482 return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil 483 } 484 485 // Make sure net.TCPConn implements this; otherwise bad things will happen. 486 var _ keepAlivable = (*net.TCPConn)(nil) 487 488 type keepAlivable interface { 489 SetKeepAlive(shouldKeepAlive bool) error 490 SetKeepAlivePeriod(d time.Duration) error 491 } 492 493 func (w *consumerWriterImpl) dialContext(ctx context.Context, addr string) (net.Conn, error) { 494 if dialer := w.connOpts.ContextDialer(); dialer != nil { 495 return dialer(ctx, "tcp", addr) 496 } 497 var dialer net.Dialer 498 return dialer.DialContext(ctx, "tcp", addr) 499 } 500 501 type connectOptions struct { 502 retry bool 503 } 504 505 func (w *consumerWriterImpl) newConnectFn(opts connectOptions) connectAllFn { 506 return func(addr string) ([]io.ReadWriteCloser, error) { 507 var ( 508 numConns = w.connOpts.NumConnections() 509 conns = make([]io.ReadWriteCloser, 0, numConns) 510 ) 511 for i := 0; i < numConns; i++ { 512 var ( 513 conn io.ReadWriteCloser 514 fn = func() error { 515 var connectErr error 516 conn, connectErr = w.connectFn(addr) 517 return connectErr 518 } 519 resultErr error 520 ) 521 if !opts.retry { 522 resultErr = fn() 523 } else { 524 resultErr = w.connRetrier.AttemptWhile(w.continueFn, fn) 525 } 526 if resultErr != nil { 527 return nil, resultErr 528 } 529 530 conns = append(conns, conn) 531 } 532 return conns, nil 533 } 534 } 535 536 type readWriterWithTimeout struct { 537 net.Conn 538 539 timeout time.Duration 540 nowFn clock.NowFn 541 } 542 543 func newReadWriterWithTimeout(conn net.Conn, timeout time.Duration, nowFn clock.NowFn) readWriterWithTimeout { 544 return readWriterWithTimeout{ 545 Conn: conn, 546 timeout: timeout, 547 nowFn: nowFn, 548 } 549 } 550 551 func (conn readWriterWithTimeout) Write(p []byte) (int, error) { 552 if conn.timeout > 0 { 553 conn.SetWriteDeadline(conn.nowFn().Add(conn.timeout)) 554 } 555 return conn.Conn.Write(p) 556 } 557 558 type uninitializedReadWriter struct{} 559 560 func (u uninitializedReadWriter) Read(p []byte) (int, error) { return 0, errInvalidConnection } 561 func (u uninitializedReadWriter) Write(p []byte) (int, error) { return 0, errInvalidConnection } 562 func (u uninitializedReadWriter) Close() error { return nil }