github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/aggregator/client/conn.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package client
    22  
    23  import (
    24  	"context"
    25  	"errors"
    26  	"math/rand"
    27  	"net"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/m3db/m3/src/x/clock"
    32  	xio "github.com/m3db/m3/src/x/io"
    33  	xnet "github.com/m3db/m3/src/x/net"
    34  	"github.com/m3db/m3/src/x/retry"
    35  
    36  	"github.com/uber-go/tally"
    37  )
    38  
    39  const (
    40  	tcpProtocol = "tcp"
    41  )
    42  
    43  var (
    44  	errNoActiveConnection = errors.New("no active connection")
    45  	errInvalidConnection  = errors.New("connection is invalid")
    46  	uninitWriter          uninitializedWriter
    47  )
    48  
    49  type (
    50  	sleepFn           func(time.Duration)
    51  	connectWithLockFn func() error
    52  	writeWithLockFn   func([]byte) error
    53  )
    54  
// connection is a persistent connection that retries establishing
// connection with exponential backoff if the connection goes down.
//
// All state is accessed with mtx held: Write and Close lock it for the whole
// call, and the *WithLock methods expect the caller to already hold it.
type connection struct {
	// metrics holds the counters for connect/write errors and write retries.
	metrics                 connectionMetrics
	// writeRetryOpts supplies the retry count and backoff parameters used by Write.
	writeRetryOpts          retry.Options
	// writer is the writer payloads go through; it is reset onto each newly
	// dialed connection.
	writer                  xio.ResettableWriter
	// connectWithLockFn is the connect hook; newConnection sets it to connectWithLock.
	connectWithLockFn       connectWithLockFn
	// sleepFn is used to wait out the backoff between write retries;
	// newConnection sets it to time.Sleep.
	sleepFn                 sleepFn
	// nowFn returns the current time, used for write deadlines and for
	// tracking connect attempts.
	nowFn                   clock.NowFn
	// conn is the current network connection, or nil when disconnected.
	conn                    net.Conn
	// rngFn is the random number source passed to retry.BackoffNanos.
	rngFn                   retry.RngFn
	// writeWithLockFn is the write hook; newConnection sets it to writeWithLock.
	writeWithLockFn         writeWithLockFn
	// addr is the remote address to dial.
	addr                    string
	// maxDuration is the time since the last connect attempt after which a
	// reconnect is tried regardless of the failure count.
	maxDuration             time.Duration
	// maxThreshold is the upper bound for threshold; once numFailures reaches
	// it, reconnects are gated on maxDuration only.
	maxThreshold            int
	// multiplier is the factor threshold is multiplied by after a failed
	// threshold-triggered reconnect.
	multiplier              int
	// initThreshold is the value threshold is reset to after a successful reconnect.
	initThreshold           int
	// threshold is the number of failures required before the next
	// failure-triggered reconnect attempt.
	threshold               int
	// lastConnectAttemptNanos is the Unix-nanosecond time of the last connect attempt.
	lastConnectAttemptNanos int64
	// writeTimeout is the deadline applied to each write, relative to nowFn().
	writeTimeout            time.Duration
	// connTimeout bounds each dial.
	connTimeout             time.Duration
	// numFailures counts failed Write calls since the last successful reconnect.
	numFailures             int
	// mtx guards the connection state.
	mtx                     sync.Mutex
	// keepAlive is the value passed to SetKeepAlive on connections that support it.
	keepAlive               bool
	// dialer is an optional custom dialer; when nil a default net.Dialer is used.
	dialer                  xnet.ContextDialerFn
}
    81  
    82  // newConnection creates a new connection.
    83  func newConnection(addr string, opts ConnectionOptions) *connection {
    84  	c := &connection{
    85  		addr:           addr,
    86  		connTimeout:    opts.ConnectionTimeout(),
    87  		writeTimeout:   opts.WriteTimeout(),
    88  		keepAlive:      opts.ConnectionKeepAlive(),
    89  		initThreshold:  opts.InitReconnectThreshold(),
    90  		multiplier:     opts.ReconnectThresholdMultiplier(),
    91  		maxThreshold:   opts.MaxReconnectThreshold(),
    92  		maxDuration:    opts.MaxReconnectDuration(),
    93  		writeRetryOpts: opts.WriteRetryOptions(),
    94  		dialer:         opts.ContextDialer(),
    95  		rngFn:          rand.New(rand.NewSource(time.Now().UnixNano())).Int63n,
    96  		nowFn:          opts.ClockOptions().NowFn(),
    97  		sleepFn:        time.Sleep,
    98  		threshold:      opts.InitReconnectThreshold(),
    99  		writer: opts.RWOptions().ResettableWriterFn()(
   100  			uninitWriter,
   101  			xio.ResettableWriterOptions{WriteBufferSize: 0},
   102  		),
   103  		metrics: newConnectionMetrics(opts.InstrumentOptions().MetricsScope()),
   104  	}
   105  	c.connectWithLockFn = c.connectWithLock
   106  	c.writeWithLockFn = c.writeWithLock
   107  
   108  	return c
   109  }
   110  
   111  // Write sends data onto the connection, and attempts to re-establish
   112  // connection if the connection is down.
   113  func (c *connection) Write(data []byte) error {
   114  	var err error
   115  	c.mtx.Lock()
   116  	if c.conn == nil {
   117  		if err = c.checkReconnectWithLock(); err != nil {
   118  			c.numFailures++
   119  			c.mtx.Unlock()
   120  			return err
   121  		}
   122  	}
   123  	if err = c.writeAttemptWithLock(data); err == nil {
   124  		c.mtx.Unlock()
   125  		return nil
   126  	}
   127  	for i := 1; i <= c.writeRetryOpts.MaxRetries(); i++ {
   128  		if backoffDur := time.Duration(retry.BackoffNanos(
   129  			i,
   130  			c.writeRetryOpts.Jitter(),
   131  			c.writeRetryOpts.BackoffFactor(),
   132  			c.writeRetryOpts.InitialBackoff(),
   133  			c.writeRetryOpts.MaxBackoff(),
   134  			c.rngFn,
   135  		)); backoffDur > 0 {
   136  			c.sleepFn(backoffDur)
   137  		}
   138  		c.metrics.writeRetries.Inc(1)
   139  		if err = c.writeAttemptWithLock(data); err == nil {
   140  			c.mtx.Unlock()
   141  			return nil
   142  		}
   143  	}
   144  	c.numFailures++
   145  	c.mtx.Unlock()
   146  	return err
   147  }
   148  
   149  func (c *connection) Close() {
   150  	c.mtx.Lock()
   151  	c.closeWithLock()
   152  	c.mtx.Unlock()
   153  }
   154  
   155  // writeAttemptWithLock attempts to establish a new connection and writes raw bytes
   156  // to the connection while holding the write lock.
   157  // If the write succeeds, c.conn is guaranteed to be a valid connection on return.
   158  // If the write fails, c.conn is guaranteed to be nil on return.
   159  func (c *connection) writeAttemptWithLock(data []byte) error {
   160  	if c.conn == nil {
   161  		if err := c.connectWithLockFn(); err != nil {
   162  			return err
   163  		}
   164  	}
   165  	if err := c.writeWithLockFn(data); err != nil {
   166  		c.closeWithLock()
   167  		return err
   168  	}
   169  	return nil
   170  }
   171  
   172  func (c *connection) connectWithLock() error {
   173  	// TODO: propagate this all the way up the callstack.
   174  	ctx := context.TODO()
   175  
   176  	c.lastConnectAttemptNanos = c.nowFn().UnixNano()
   177  
   178  	ctx, cancel := context.WithTimeout(ctx, c.connTimeout)
   179  	defer cancel()
   180  
   181  	conn, err := c.dialContext(ctx, c.addr)
   182  	if err != nil {
   183  		c.metrics.connectError.Inc(1)
   184  		return err
   185  	}
   186  
   187  	// N.B.: If using a custom dialer which doesn't return *net.TCPConn, users are responsible for TCP keep alive options
   188  	// themselves.
   189  	if tcpConn, ok := conn.(keepAlivable); ok {
   190  		if err := tcpConn.SetKeepAlive(c.keepAlive); err != nil {
   191  			c.metrics.setKeepAliveError.Inc(1)
   192  		}
   193  	}
   194  
   195  	if c.conn != nil {
   196  		c.conn.Close() // nolint: errcheck
   197  	}
   198  
   199  	c.conn = conn
   200  	c.writer.Reset(conn)
   201  	return nil
   202  }
   203  
   204  // Make sure net.TCPConn implements this; otherwise bad things will happen.
   205  var _ keepAlivable = (*net.TCPConn)(nil)
   206  
   207  type keepAlivable interface {
   208  	SetKeepAlive(shouldKeepAlive bool) error
   209  }
   210  
   211  func (c *connection) dialContext(ctx context.Context, addr string) (net.Conn, error) {
   212  	if dialer := c.dialer; dialer != nil {
   213  		return dialer(ctx, tcpProtocol, addr)
   214  	}
   215  	var dialer net.Dialer
   216  	return dialer.DialContext(ctx, tcpProtocol, addr)
   217  }
   218  
   219  func (c *connection) checkReconnectWithLock() error {
   220  	// If we haven't accumulated enough failures to warrant another reconnect
   221  	// and we haven't past the maximum duration since the last time we attempted
   222  	// to connect then we simply return false without reconnecting.
   223  	// If we exhausted maximum allowed failures then we will retry only based on
   224  	// maximum duration since the last attempt.
   225  	enoughFailuresToRetry := c.numFailures >= c.threshold
   226  	exhaustedMaxFailures := c.numFailures >= c.maxThreshold
   227  	sufficientTimePassed := c.nowFn().UnixNano()-c.lastConnectAttemptNanos >= c.maxDuration.Nanoseconds()
   228  	if !sufficientTimePassed && (exhaustedMaxFailures || !enoughFailuresToRetry) {
   229  		return errNoActiveConnection
   230  	}
   231  	err := c.connectWithLockFn()
   232  	if err == nil {
   233  		c.resetWithLock()
   234  		return nil
   235  	}
   236  
   237  	// Only raise the threshold when it is crossed, not when the max duration is reached.
   238  	if enoughFailuresToRetry && c.threshold < c.maxThreshold {
   239  		newThreshold := c.threshold * c.multiplier
   240  		if newThreshold > c.maxThreshold {
   241  			newThreshold = c.maxThreshold
   242  		}
   243  		c.threshold = newThreshold
   244  	}
   245  	return err
   246  }
   247  
   248  func (c *connection) writeWithLock(data []byte) error {
   249  	if err := c.conn.SetWriteDeadline(c.nowFn().Add(c.writeTimeout)); err != nil {
   250  		c.metrics.setWriteDeadlineError.Inc(1)
   251  	}
   252  	if _, err := c.writer.Write(data); err != nil {
   253  		c.metrics.writeError.Inc(1)
   254  		return err
   255  	}
   256  	if err := c.writer.Flush(); err != nil {
   257  		c.metrics.writeError.Inc(1)
   258  		return err
   259  	}
   260  	return nil
   261  }
   262  
   263  func (c *connection) resetWithLock() {
   264  	c.numFailures = 0
   265  	c.threshold = c.initThreshold
   266  }
   267  
   268  func (c *connection) closeWithLock() {
   269  	if c.conn != nil {
   270  		c.conn.Close() // nolint: errcheck
   271  	}
   272  	c.conn = nil
   273  }
   274  
   275  const (
   276  	errorMetric     = "errors"
   277  	errorMetricType = "error-type"
   278  )
   279  
   280  type connectionMetrics struct {
   281  	connectError          tally.Counter
   282  	writeError            tally.Counter
   283  	writeRetries          tally.Counter
   284  	setKeepAliveError     tally.Counter
   285  	setWriteDeadlineError tally.Counter
   286  }
   287  
   288  func newConnectionMetrics(scope tally.Scope) connectionMetrics {
   289  	return connectionMetrics{
   290  		connectError: scope.Tagged(map[string]string{errorMetricType: "connect"}).
   291  			Counter(errorMetric),
   292  		writeError: scope.Tagged(map[string]string{errorMetricType: "write"}).
   293  			Counter(errorMetric),
   294  		writeRetries: scope.Tagged(map[string]string{"action": "write"}).Counter("retries"),
   295  		setKeepAliveError: scope.Tagged(map[string]string{errorMetricType: "tcp-keep-alive"}).
   296  			Counter(errorMetric),
   297  		setWriteDeadlineError: scope.Tagged(map[string]string{errorMetricType: "set-write-deadline"}).
   298  			Counter(errorMetric),
   299  	}
   300  }
   301  
   302  type uninitializedWriter struct{}
   303  
   304  func (u uninitializedWriter) Write(p []byte) (int, error) { return 0, errInvalidConnection }
   305  func (u uninitializedWriter) Close() error                { return nil }