github.com/m3db/m3@v1.5.0/src/msg/producer/writer/consumer_writer.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package writer
    22  
    23  import (
    24  	"context"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"net"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/m3db/m3/src/msg/generated/proto/msgpb"
    33  	"github.com/m3db/m3/src/msg/protocol/proto"
    34  	"github.com/m3db/m3/src/x/clock"
    35  	xio "github.com/m3db/m3/src/x/io"
    36  	"github.com/m3db/m3/src/x/retry"
    37  
    38  	"github.com/uber-go/tally"
    39  	"go.uber.org/zap"
    40  )
    41  
const (
	// defaultRetryForever is passed to SetForever on the connection retry
	// options, so reconnect attempts are not bounded by a retry count.
	defaultRetryForever = true
)

var (
	// errInvalidConnection is returned by Write and readAcks while the
	// writer has no valid connections, and by every Read/Write on the
	// uninitialized placeholder connection.
	errInvalidConnection = errors.New("connection is invalid")
	// u is the shared placeholder connection installed before any real
	// connection has been established.
	u                    uninitializedReadWriter
)
    50  
// consumerWriter writes encoded messages to a single consumer address over
// one or more connections.
type consumerWriter interface {
	// Address returns the consumer address.
	Address() string

	// Write writes the bytes to the connection at connIndex. It is thread
	// safe per connection index.
	Write(connIndex int, b []byte) error

	// Init initializes the consumer writer.
	Init()

	// Close closes the consumer writer.
	Close()
}
    64  
// consumerWriterMetrics groups the counters emitted by a consumer writer.
type consumerWriterMetrics struct {
	// writeInvalidConn counts writes rejected because connections are invalid.
	writeInvalidConn        tally.Counter
	// readInvalidConn counts ack reads skipped because connections are invalid.
	readInvalidConn         tally.Counter
	// ackError counts acks that the ack router returned an error for.
	ackError                tally.Counter
	// decodeError counts failures decoding an ack from a connection.
	decodeError             tally.Counter
	// encodeError counts failed writes to a connection's writer.
	encodeError             tally.Counter
	// resetTooSoon counts reset requests skipped due to the reset delay.
	resetTooSoon            tally.Counter
	// resetSuccess counts successful reconnects.
	resetSuccess            tally.Counter
	// resetError counts failed reconnects.
	resetError              tally.Counter
	// connectError counts failed dial attempts.
	connectError            tally.Counter
	// setKeepAliveError counts failures enabling TCP keep alive.
	setKeepAliveError       tally.Counter
	// setKeepAlivePeriodError counts failures setting the keep alive period.
	setKeepAlivePeriodError tally.Counter
}
    78  
    79  func newConsumerWriterMetrics(scope tally.Scope) consumerWriterMetrics {
    80  	return consumerWriterMetrics{
    81  		writeInvalidConn:        scope.Counter("write-invalid-conn"),
    82  		readInvalidConn:         scope.Counter("read-invalid-conn"),
    83  		ackError:                scope.Counter("ack-error"),
    84  		decodeError:             scope.Counter("decode-error"),
    85  		encodeError:             scope.Counter("encode-error"),
    86  		resetTooSoon:            scope.Counter("reset-too-soon"),
    87  		resetSuccess:            scope.Counter("reset-success"),
    88  		resetError:              scope.Counter("reset-error"),
    89  		connectError:            scope.Counter("connect-error"),
    90  		setKeepAliveError:       scope.Counter("set-keep-alive-error"),
    91  		setKeepAlivePeriodError: scope.Counter("set-keep-alive-period-error"),
    92  	}
    93  }
    94  
// connectFn opens a single connection to addr.
type connectFn func(addr string) (io.ReadWriteCloser, error)

// connectAllFn opens the full set of connections to addr.
type connectAllFn func(addr string) ([]io.ReadWriteCloser, error)
    98  
// consumerWriterImpl is the consumerWriter implementation. It owns a set of
// connections to one consumer address plus the background goroutines that
// flush them, read acks from them and re-establish them on failure.
type consumerWriterImpl struct {
	// writeState holds the mutable connection state and its lock.
	writeState consumerWriterImplWriteState

	// addr is the consumer address that is dialed and reported by Address.
	addr        string
	// router receives the metadata of every ack read from the connections.
	router      ackRouter
	opts        Options
	connOpts    ConnectionOptions
	// ackRetrier drives the ack reading attempts.
	ackRetrier  retry.Retrier
	// connRetrier drives connection attempts when connecting with retry.
	connRetrier retry.Retrier
	logger      *zap.Logger

	// resetCh has capacity one and signals that the connections should be
	// re-established; doneCh is closed by Close to stop the goroutines.
	resetCh chan struct{}
	doneCh  chan struct{}
	// wg tracks the goroutines started by Init.
	wg      sync.WaitGroup
	m       consumerWriterMetrics

	nowFn     clock.NowFn
	// connectFn opens one connection; it is a field so it can be replaced.
	connectFn connectFn
}
   118  
// consumerWriterImplWriteState is the lock protected state of a consumer
// writer.
type consumerWriterImplWriteState struct {
	sync.RWMutex

	// closed is set once Close has been called.
	closed     bool
	// validConns is false until connecting succeeds, and is cleared again
	// while a reconnect is in progress.
	validConns bool

	// conns keeps active connections.
	// Note: readers will take a reference to this slice with a lock
	// then loop through it and call decode on decoders, so not safe
	// to reuse.
	conns          []*connection
	// lastResetNanos is the unix nanosecond timestamp recorded by the last
	// reset; it is compared against the reset delay to throttle resets.
	lastResetNanos int64
}
   132  
// connection bundles one underlying connection with the writer and the
// decoder built on top of it.
type connection struct {
	// writeLock serializes writes and flushes on w.
	writeLock sync.Mutex
	// conn is the underlying connection; it is closed on reset and shutdown.
	conn      io.ReadWriteCloser
	// w is the resettable writer that was reset to write to conn.
	w         xio.ResettableWriter
	// decoder decodes acks read from conn.
	decoder   proto.Decoder
	// ack is the message every decode on this connection is read into.
	ack       msgpb.Ack
}
   140  
   141  func newConsumerWriter(
   142  	addr string,
   143  	router ackRouter,
   144  	opts Options,
   145  	m consumerWriterMetrics,
   146  ) consumerWriter {
   147  	if opts == nil {
   148  		opts = NewOptions()
   149  	}
   150  
   151  	connOpts := opts.ConnectionOptions()
   152  	w := &consumerWriterImpl{
   153  		addr:        addr,
   154  		router:      router,
   155  		opts:        opts,
   156  		connOpts:    connOpts,
   157  		ackRetrier:  retry.NewRetrier(opts.AckErrorRetryOptions()),
   158  		connRetrier: retry.NewRetrier(connOpts.RetryOptions().SetForever(defaultRetryForever)),
   159  		logger:      opts.InstrumentOptions().Logger(),
   160  		resetCh:     make(chan struct{}, 1),
   161  		doneCh:      make(chan struct{}),
   162  		m:           m,
   163  		nowFn:       time.Now,
   164  	}
   165  	w.connectFn = w.connectNoRetry
   166  
   167  	// Initialize no-op connections since it's valid even if connecting the
   168  	// first time fails to continue to try to write to the writer.
   169  	// Note: Also tests try to break a non-connected writer.
   170  	conns := make([]io.ReadWriteCloser, 0, connOpts.NumConnections())
   171  	for i := 0; i < connOpts.NumConnections(); i++ {
   172  		conns = append(conns, u)
   173  	}
   174  	// NB(r): Reset at epoch since a connection failure should trigger
   175  	// an immediate reset after first connection attempt (if write fails
   176  	// since first connection is with no retry).
   177  	w.reset(resetOptions{
   178  		connections: conns,
   179  		at:          time.Time{},
   180  		validConns:  false,
   181  	})
   182  
   183  	// Try connecting without retry first attempt.
   184  	connectAllNoRetry := w.newConnectFn(connectOptions{retry: false})
   185  	if err := w.resetWithConnectFn(connectAllNoRetry); err != nil {
   186  		w.notifyReset(err)
   187  	}
   188  	return w
   189  }
   190  
// Address returns the consumer address this writer was created with.
func (w *consumerWriterImpl) Address() string {
	return w.addr
}
   194  
   195  // Write should fail fast so that the write could be tried on other
   196  // consumer writers that are sharing the message queue.
   197  func (w *consumerWriterImpl) Write(connIndex int, b []byte) error {
   198  	w.writeState.RLock()
   199  	if !w.writeState.validConns || len(w.writeState.conns) == 0 {
   200  		w.writeState.RUnlock()
   201  		w.m.writeInvalidConn.Inc(1)
   202  		return errInvalidConnection
   203  	}
   204  	if connIndex < 0 || connIndex >= len(w.writeState.conns) {
   205  		w.writeState.RUnlock()
   206  		return fmt.Errorf("connection index out of range: %d", connIndex)
   207  	}
   208  
   209  	writeConn := w.writeState.conns[connIndex]
   210  
   211  	// Make sure only writer to this connection.
   212  	writeConn.writeLock.Lock()
   213  	_, err := writeConn.w.Write(b)
   214  	writeConn.writeLock.Unlock()
   215  
   216  	// Hold onto the write state lock until done, since
   217  	// closing connections are done by acquiring the write state lock.
   218  	w.writeState.RUnlock()
   219  
   220  	if err != nil {
   221  		w.notifyReset(err)
   222  		w.m.encodeError.Inc(1)
   223  	}
   224  
   225  	return err
   226  }
   227  
   228  func (w *consumerWriterImpl) Init() {
   229  	w.wg.Add(1)
   230  	go func() {
   231  		w.resetConnectionUntilClose()
   232  		w.wg.Done()
   233  	}()
   234  
   235  	for i := 0; i < w.connOpts.NumConnections(); i++ {
   236  		idx := i
   237  		w.wg.Add(1)
   238  		go func() {
   239  			w.readAcksUntilClose(idx)
   240  			w.wg.Done()
   241  		}()
   242  	}
   243  
   244  	w.wg.Add(1)
   245  	go func() {
   246  		w.flushUntilClose()
   247  		w.wg.Done()
   248  	}()
   249  }
   250  
   251  func (w *consumerWriterImpl) flushUntilClose() {
   252  	flushTicker := time.NewTicker(w.connOpts.FlushInterval())
   253  	defer flushTicker.Stop()
   254  
   255  	for {
   256  		select {
   257  		case <-flushTicker.C:
   258  			w.writeState.RLock()
   259  			for _, conn := range w.writeState.conns {
   260  				conn.writeLock.Lock()
   261  				if err := conn.w.Flush(); err != nil {
   262  					w.notifyReset(err)
   263  				}
   264  				conn.writeLock.Unlock()
   265  			}
   266  			// Hold onto the write state lock until done, since
   267  			// closing connections are done by acquiring the write state lock.
   268  			w.writeState.RUnlock()
   269  		case <-w.doneCh:
   270  			return
   271  		}
   272  	}
   273  }
   274  
   275  func (w *consumerWriterImpl) resetConnectionUntilClose() {
   276  	for {
   277  		select {
   278  		case <-w.resetCh:
   279  			// Avoid resetting too frequent.
   280  			if w.resetTooSoon() {
   281  				w.m.resetTooSoon.Inc(1)
   282  				continue
   283  			}
   284  			// Connect with retry.
   285  			connectAllWithRetry := w.newConnectFn(connectOptions{retry: true})
   286  			if err := w.resetWithConnectFn(connectAllWithRetry); err != nil {
   287  				w.m.resetError.Inc(1)
   288  				w.logger.Error("could not reconnect", zap.String("address", w.addr), zap.Error(err))
   289  				continue
   290  			}
   291  			w.m.resetSuccess.Inc(1)
   292  			w.logger.Info("reconnected", zap.String("address", w.addr))
   293  		case <-w.doneCh:
   294  			w.writeState.Lock()
   295  			for _, c := range w.writeState.conns {
   296  				c.conn.Close()
   297  			}
   298  			w.writeState.Unlock()
   299  			return
   300  		}
   301  	}
   302  }
   303  
   304  func (w *consumerWriterImpl) resetTooSoon() bool {
   305  	w.writeState.RLock()
   306  	defer w.writeState.RUnlock()
   307  	return w.nowFn().UnixNano() < w.writeState.lastResetNanos+int64(w.connOpts.ResetDelay())
   308  }
   309  
   310  func (w *consumerWriterImpl) resetWithConnectFn(fn connectAllFn) error {
   311  	w.writeState.Lock()
   312  	w.writeState.validConns = false
   313  	w.writeState.Unlock()
   314  	conns, err := fn(w.addr)
   315  	if err != nil {
   316  		return err
   317  	}
   318  	w.reset(resetOptions{
   319  		connections: conns,
   320  		at:          w.nowFn(),
   321  		validConns:  true,
   322  	})
   323  	return nil
   324  }
   325  
   326  func (w *consumerWriterImpl) readAcksUntilClose(idx int) {
   327  	for {
   328  		select {
   329  		case <-w.doneCh:
   330  			return
   331  		default:
   332  			w.ackRetrier.AttemptWhile(w.continueFn,
   333  				func() error {
   334  					return w.readAcks(idx)
   335  				})
   336  		}
   337  	}
   338  }
   339  
   340  func (w *consumerWriterImpl) continueFn(int) bool {
   341  	return !w.isClosed()
   342  }
   343  
   344  func (w *consumerWriterImpl) readAcks(idx int) error {
   345  	w.writeState.RLock()
   346  	validConns := w.writeState.validConns
   347  	conn := w.writeState.conns[idx]
   348  	w.writeState.RUnlock()
   349  	if !validConns {
   350  		w.m.readInvalidConn.Inc(1)
   351  		return errInvalidConnection
   352  	}
   353  
   354  	// Read from decoder, safe to read from acquired decoder as not re-used.
   355  	// NB(cw) The proto needs to be cleaned up because the gogo protobuf
   356  	// unmarshalling will append to the underlying slice.
   357  	conn.ack.Metadata = conn.ack.Metadata[:0]
   358  	err := conn.decoder.Decode(&conn.ack)
   359  	if err != nil {
   360  		w.notifyReset(err)
   361  		w.m.decodeError.Inc(1)
   362  		return err
   363  	}
   364  	for _, m := range conn.ack.Metadata {
   365  		if err := w.router.Ack(newMetadataFromProto(m)); err != nil {
   366  			w.m.ackError.Inc(1)
   367  			// This is fine, usually this means the ack has been acked.
   368  			w.logger.Error("could not ack metadata", zap.Error(err))
   369  		}
   370  	}
   371  
   372  	return nil
   373  }
   374  
   375  func (w *consumerWriterImpl) Close() {
   376  	w.writeState.Lock()
   377  	wasClosed := w.writeState.closed
   378  	w.writeState.closed = true
   379  	w.writeState.Unlock()
   380  
   381  	if wasClosed {
   382  		return
   383  	}
   384  
   385  	close(w.doneCh)
   386  
   387  	w.wg.Wait()
   388  }
   389  
   390  func (w *consumerWriterImpl) notifyReset(err error) {
   391  	select {
   392  	case w.resetCh <- struct{}{}:
   393  		if err != nil {
   394  			w.logger.Error("connection error", zap.Error(err))
   395  		}
   396  	default:
   397  	}
   398  }
   399  
   400  func (w *consumerWriterImpl) isClosed() bool {
   401  	w.writeState.RLock()
   402  	defer w.writeState.RUnlock()
   403  	return w.writeState.closed
   404  }
   405  
// resetOptions describes the connection state that reset installs.
type resetOptions struct {
	// connections are the connections to install.
	connections []io.ReadWriteCloser
	// at is the time recorded as the time of the reset.
	at          time.Time
	// validConns tells whether the connections can be written to and read
	// from.
	validConns  bool
}
   411  
   412  func (w *consumerWriterImpl) reset(opts resetOptions) {
   413  	w.writeState.Lock()
   414  	prevConns := w.writeState.conns
   415  	defer func() {
   416  		w.writeState.Unlock()
   417  		// Close existing connections outside of locks.
   418  		for _, c := range prevConns {
   419  			c.conn.Close()
   420  		}
   421  	}()
   422  
   423  	var (
   424  		wOpts = xio.ResettableWriterOptions{
   425  			WriteBufferSize: w.connOpts.WriteBufferSize(),
   426  		}
   427  
   428  		rwOpts   = w.opts.DecoderOptions().RWOptions()
   429  		writerFn = rwOpts.ResettableWriterFn()
   430  	)
   431  
   432  	w.writeState.conns = make([]*connection, 0, len(opts.connections))
   433  	for _, conn := range opts.connections {
   434  		wr := writerFn(u, wOpts)
   435  		wr.Reset(conn)
   436  
   437  		decoder := proto.NewDecoder(conn, w.opts.DecoderOptions(), w.connOpts.ReadBufferSize())
   438  		newConn := &connection{
   439  			conn:    conn,
   440  			w:       wr,
   441  			decoder: decoder,
   442  		}
   443  
   444  		w.writeState.conns = append(w.writeState.conns, newConn)
   445  	}
   446  
   447  	w.writeState.lastResetNanos = opts.at.UnixNano()
   448  	w.writeState.validConns = opts.validConns
   449  }
   450  
   451  func (w *consumerWriterImpl) connectNoRetry(addr string) (io.ReadWriteCloser, error) {
   452  	// Upcast readWriterWithTimeout to the interface; this allows us to mock out the connectNoRetry function in tests.
   453  	return w.connectNoRetryWithTimeout(addr)
   454  }
   455  
   456  func (w *consumerWriterImpl) connectNoRetryWithTimeout(addr string) (readWriterWithTimeout, error) {
   457  	// N.B.: this is roughly equivalent to what net.DialTimeout does; shouldn't introduce performance regressions.
   458  	ctx, cancel := context.WithTimeout(context.Background(), w.connOpts.DialTimeout())
   459  	defer cancel()
   460  
   461  	conn, err := w.dialContext(ctx, addr)
   462  	if err != nil {
   463  		w.m.connectError.Inc(1)
   464  		return readWriterWithTimeout{}, err
   465  	}
   466  	tcpConn, ok := conn.(keepAlivable)
   467  	if !ok {
   468  		// If using a custom dialer which doesn't return *net.TCPConn, users are responsible for TCP keep alive options
   469  		// themselves.
   470  		return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil
   471  	}
   472  	if err = tcpConn.SetKeepAlive(true); err != nil {
   473  		w.m.setKeepAliveError.Inc(1)
   474  	}
   475  	keepAlivePeriod := w.connOpts.KeepAlivePeriod()
   476  	if keepAlivePeriod <= 0 {
   477  		return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil
   478  	}
   479  	if err = tcpConn.SetKeepAlivePeriod(keepAlivePeriod); err != nil {
   480  		w.m.setKeepAlivePeriodError.Inc(1)
   481  	}
   482  	return newReadWriterWithTimeout(conn, w.connOpts.WriteTimeout(), w.nowFn), nil
   483  }
   484  
// Make sure net.TCPConn implements this; otherwise bad things will happen.
// This is a compile time assertion only.
var _ keepAlivable = (*net.TCPConn)(nil)

// keepAlivable is implemented by connections on which TCP keep alive can be
// configured.
type keepAlivable interface {
	// SetKeepAlive turns keep alive on or off.
	SetKeepAlive(shouldKeepAlive bool) error
	// SetKeepAlivePeriod sets the keep alive period.
	SetKeepAlivePeriod(d time.Duration) error
}
   492  
   493  func (w *consumerWriterImpl) dialContext(ctx context.Context, addr string) (net.Conn, error) {
   494  	if dialer := w.connOpts.ContextDialer(); dialer != nil {
   495  		return dialer(ctx, "tcp", addr)
   496  	}
   497  	var dialer net.Dialer
   498  	return dialer.DialContext(ctx, "tcp", addr)
   499  }
   500  
// connectOptions configures the connect function built by newConnectFn.
type connectOptions struct {
	// retry selects connecting through the connection retrier instead of
	// making a single attempt per connection.
	retry bool
}
   504  
   505  func (w *consumerWriterImpl) newConnectFn(opts connectOptions) connectAllFn {
   506  	return func(addr string) ([]io.ReadWriteCloser, error) {
   507  		var (
   508  			numConns = w.connOpts.NumConnections()
   509  			conns    = make([]io.ReadWriteCloser, 0, numConns)
   510  		)
   511  		for i := 0; i < numConns; i++ {
   512  			var (
   513  				conn io.ReadWriteCloser
   514  				fn   = func() error {
   515  					var connectErr error
   516  					conn, connectErr = w.connectFn(addr)
   517  					return connectErr
   518  				}
   519  				resultErr error
   520  			)
   521  			if !opts.retry {
   522  				resultErr = fn()
   523  			} else {
   524  				resultErr = w.connRetrier.AttemptWhile(w.continueFn, fn)
   525  			}
   526  			if resultErr != nil {
   527  				return nil, resultErr
   528  			}
   529  
   530  			conns = append(conns, conn)
   531  		}
   532  		return conns, nil
   533  	}
   534  }
   535  
// readWriterWithTimeout is a net.Conn whose Write sets a write deadline
// before writing.
type readWriterWithTimeout struct {
	net.Conn

	// timeout is the duration given to each write; values that are not
	// positive disable the write deadline.
	timeout time.Duration
	// nowFn supplies the current time that the deadline is computed from.
	nowFn   clock.NowFn
}
   542  
   543  func newReadWriterWithTimeout(conn net.Conn, timeout time.Duration, nowFn clock.NowFn) readWriterWithTimeout {
   544  	return readWriterWithTimeout{
   545  		Conn:    conn,
   546  		timeout: timeout,
   547  		nowFn:   nowFn,
   548  	}
   549  }
   550  
   551  func (conn readWriterWithTimeout) Write(p []byte) (int, error) {
   552  	if conn.timeout > 0 {
   553  		conn.SetWriteDeadline(conn.nowFn().Add(conn.timeout))
   554  	}
   555  	return conn.Conn.Write(p)
   556  }
   557  
   558  type uninitializedReadWriter struct{}
   559  
   560  func (u uninitializedReadWriter) Read(p []byte) (int, error)  { return 0, errInvalidConnection }
   561  func (u uninitializedReadWriter) Write(p []byte) (int, error) { return 0, errInvalidConnection }
   562  func (u uninitializedReadWriter) Close() error                { return nil }