github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/rpc/nodedialer/nodedialer.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package nodedialer
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"net"
    17  	"time"
    18  	"unsafe"
    19  
    20  	circuit "github.com/cockroachdb/circuitbreaker"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/rpc"
    25  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    28  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    29  	"github.com/cockroachdb/errors"
    30  	"google.golang.org/grpc"
    31  )
    32  
// logPerNodeFailInterval bounds how often connection failures are logged: no
// more than one failure to connect to a given node will be logged in the given
// interval. It is the interval handed to log.Every for each breaker that
// getBreaker creates.
const logPerNodeFailInterval = time.Minute
    35  
// wrappedBreaker bundles a circuit breaker with a log rate limiter. Both are
// embedded, so the methods this file relies on (Ready, Fail, Success and
// ShouldLog) can be called directly on the wrapper.
type wrappedBreaker struct {
	*circuit.Breaker
	log.EveryN
}
    40  
// An AddressResolver translates NodeIDs into addresses. It returns a non-nil
// error when no address can be produced for the given node; callers in this
// file propagate (and in Dial's case wrap) that error.
type AddressResolver func(roachpb.NodeID) (net.Addr, error)
    43  
// A Dialer wraps an *rpc.Context for dialing based on node IDs. For each node,
// it maintains a circuit breaker that prevents rapid connection attempts and
// provides hints to the callers on whether to log the outcome of the operation.
//
// rpcContext performs the actual dialing and creates new breakers; resolver
// maps a node ID to the address to dial (the exported dialing methods report
// "no node dialer configured" when it is nil); breakers holds one map per
// connection class, each populated lazily by getBreaker.
type Dialer struct {
	rpcContext *rpc.Context
	resolver   AddressResolver

	breakers [rpc.NumConnectionClasses]syncutil.IntMap // map[roachpb.NodeID]*wrappedBreaker
}
    53  
    54  // New initializes a Dialer.
    55  func New(rpcContext *rpc.Context, resolver AddressResolver) *Dialer {
    56  	return &Dialer{
    57  		rpcContext: rpcContext,
    58  		resolver:   resolver,
    59  	}
    60  }
    61  
    62  // Stopper returns this node dialer's Stopper.
    63  // TODO(bdarnell): This is a bit of a hack for kv/transport_race.go
    64  func (n *Dialer) Stopper() *stop.Stopper {
    65  	return n.rpcContext.Stopper
    66  }
    67  
    68  // Silence lint warning because this method is only used in race builds.
    69  var _ = (*Dialer).Stopper
    70  
    71  // Dial returns a grpc connection to the given node. It logs whenever the
    72  // node first becomes unreachable or reachable.
    73  func (n *Dialer) Dial(
    74  	ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass,
    75  ) (_ *grpc.ClientConn, err error) {
    76  	if n == nil || n.resolver == nil {
    77  		return nil, errors.New("no node dialer configured")
    78  	}
    79  	// Don't trip the breaker if we're already canceled.
    80  	if ctxErr := ctx.Err(); ctxErr != nil {
    81  		return nil, ctxErr
    82  	}
    83  	breaker := n.getBreaker(nodeID, class)
    84  	addr, err := n.resolver(nodeID)
    85  	if err != nil {
    86  		err = errors.Wrapf(err, "failed to resolve n%d", nodeID)
    87  		breaker.Fail(err)
    88  		return nil, err
    89  	}
    90  	return n.dial(ctx, nodeID, addr, breaker, class)
    91  }
    92  
    93  // DialNoBreaker ignores the breaker if there is an error dialing. This function
    94  // should only be used when there is good reason to believe that the node is reachable.
    95  func (n *Dialer) DialNoBreaker(
    96  	ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass,
    97  ) (_ *grpc.ClientConn, err error) {
    98  	if n == nil || n.resolver == nil {
    99  		return nil, errors.New("no node dialer configured")
   100  	}
   101  	addr, err := n.resolver(nodeID)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  	return n.dial(ctx, nodeID, addr, nil /* breaker */, class)
   106  }
   107  
   108  // DialInternalClient is a specialization of DialClass for callers that
   109  // want a roachpb.InternalClient. This supports an optimization to bypass the
   110  // network for the local node. Returns a context.Context which should be used
   111  // when making RPC calls on the returned server. (This context is annotated to
   112  // mark this request as in-process and bypass ctx.Peer checks).
   113  func (n *Dialer) DialInternalClient(
   114  	ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass,
   115  ) (context.Context, roachpb.InternalClient, error) {
   116  	if n == nil || n.resolver == nil {
   117  		return nil, nil, errors.New("no node dialer configured")
   118  	}
   119  	addr, err := n.resolver(nodeID)
   120  	if err != nil {
   121  		return nil, nil, err
   122  	}
   123  	if localClient := n.rpcContext.GetLocalInternalClientForAddr(addr.String(), nodeID); localClient != nil {
   124  		log.VEvent(ctx, 2, "sending request to local client")
   125  
   126  		// Create a new context from the existing one with the "local request" field set.
   127  		// This tells the handler that this is an in-process request, bypassing ctx.Peer checks.
   128  		localCtx := grpcutil.NewLocalRequestContext(ctx)
   129  
   130  		return localCtx, localClient, nil
   131  	}
   132  	log.VEventf(ctx, 2, "sending request to %s", addr)
   133  	conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), class)
   134  	if err != nil {
   135  		return nil, nil, err
   136  	}
   137  	return ctx, roachpb.NewInternalClient(conn), err
   138  }
   139  
   140  // dial performs the dialing of the remote connection. If breaker is nil,
   141  // then perform this logic without using any breaker functionality.
   142  func (n *Dialer) dial(
   143  	ctx context.Context,
   144  	nodeID roachpb.NodeID,
   145  	addr net.Addr,
   146  	breaker *wrappedBreaker,
   147  	class rpc.ConnectionClass,
   148  ) (_ *grpc.ClientConn, err error) {
   149  	// Don't trip the breaker if we're already canceled.
   150  	if ctxErr := ctx.Err(); ctxErr != nil {
   151  		return nil, ctxErr
   152  	}
   153  	if breaker != nil && !breaker.Ready() {
   154  		err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
   155  		return nil, err
   156  	}
   157  	defer func() {
   158  		// Enforce a minimum interval between warnings for failed connections.
   159  		if err != nil && ctx.Err() == nil && breaker != nil && breaker.ShouldLog() {
   160  			log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err)
   161  		}
   162  	}()
   163  	conn, err := n.rpcContext.GRPCDialNode(addr.String(), nodeID, class).Connect(ctx)
   164  	if err != nil {
   165  		// If we were canceled during the dial, don't trip the breaker.
   166  		if ctxErr := ctx.Err(); ctxErr != nil {
   167  			return nil, ctxErr
   168  		}
   169  		err = errors.Wrapf(err, "failed to connect to n%d at %v", nodeID, addr)
   170  		if breaker != nil {
   171  			breaker.Fail(err)
   172  		}
   173  		return nil, err
   174  	}
   175  	// Check to see if the connection is in the transient failure state. This can
   176  	// happen if the connection already existed, but a recent heartbeat has
   177  	// failed and we haven't yet torn down the connection.
   178  	err = grpcutil.ConnectionReady(conn)
   179  	if err := grpcutil.ConnectionReady(conn); err != nil {
   180  		err = errors.Wrapf(err, "failed to check for ready connection to n%d at %v", nodeID, addr)
   181  		if breaker != nil {
   182  			breaker.Fail(err)
   183  		}
   184  		return nil, err
   185  	}
   186  
   187  	// TODO(bdarnell): Reconcile the different health checks and circuit breaker
   188  	// behavior in this file. Note that this different behavior causes problems
   189  	// for higher-levels in the system. For example, DistSQL checks for
   190  	// ConnHealth when scheduling processors, but can then see attempts to send
   191  	// RPCs fail when dial fails due to an open breaker. Reset the breaker here
   192  	// as a stop-gap before the reconciliation occurs.
   193  	if breaker != nil {
   194  		breaker.Success()
   195  	}
   196  	return conn, nil
   197  }
   198  
   199  // ConnHealth returns nil if we have an open connection of the request
   200  // class to the given node that succeeded on its most recent heartbeat. See the
   201  // method of the same name on rpc.Context for more details.
   202  func (n *Dialer) ConnHealth(nodeID roachpb.NodeID, class rpc.ConnectionClass) error {
   203  	if n == nil || n.resolver == nil {
   204  		return errors.New("no node dialer configured")
   205  	}
   206  	if !n.getBreaker(nodeID, class).Ready() {
   207  		return circuit.ErrBreakerOpen
   208  	}
   209  	addr, err := n.resolver(nodeID)
   210  	if err != nil {
   211  		return err
   212  	}
   213  	// TODO(bdarnell): GRPCDialNode should detect local addresses and return
   214  	// a dummy connection instead of requiring callers to do this check.
   215  	if n.rpcContext.GetLocalInternalClientForAddr(addr.String(), nodeID) != nil {
   216  		// The local client is always considered healthy.
   217  		return nil
   218  	}
   219  	conn := n.rpcContext.GRPCDialNode(addr.String(), nodeID, class)
   220  	return conn.Health()
   221  }
   222  
   223  // GetCircuitBreaker retrieves the circuit breaker for connections to the
   224  // given node. The breaker should not be mutated as this affects all connections
   225  // dialing to that node through this NodeDialer.
   226  func (n *Dialer) GetCircuitBreaker(
   227  	nodeID roachpb.NodeID, class rpc.ConnectionClass,
   228  ) *circuit.Breaker {
   229  	return n.getBreaker(nodeID, class).Breaker
   230  }
   231  
   232  func (n *Dialer) getBreaker(nodeID roachpb.NodeID, class rpc.ConnectionClass) *wrappedBreaker {
   233  	breakers := &n.breakers[class]
   234  	value, ok := breakers.Load(int64(nodeID))
   235  	if !ok {
   236  		name := fmt.Sprintf("rpc %v [n%d]", n.rpcContext.Config.Addr, nodeID)
   237  		breaker := &wrappedBreaker{Breaker: n.rpcContext.NewBreaker(name), EveryN: log.Every(logPerNodeFailInterval)}
   238  		value, _ = breakers.LoadOrStore(int64(nodeID), unsafe.Pointer(breaker))
   239  	}
   240  	return (*wrappedBreaker)(value)
   241  }
   242  
// dialerAdapter is a Dialer under a different name. Its Ready and Dial methods
// take only a node ID and always use rpc.DefaultClass; CTDialer returns it as
// a closedts.Dialer.
type dialerAdapter Dialer
   244  
   245  func (da *dialerAdapter) Ready(nodeID roachpb.NodeID) bool {
   246  	return (*Dialer)(da).GetCircuitBreaker(nodeID, rpc.DefaultClass).Ready()
   247  }
   248  
   249  func (da *dialerAdapter) Dial(ctx context.Context, nodeID roachpb.NodeID) (ctpb.Client, error) {
   250  	c, err := (*Dialer)(da).Dial(ctx, nodeID, rpc.DefaultClass)
   251  	if err != nil {
   252  		return nil, err
   253  	}
   254  	return ctpb.NewClosedTimestampClient(c).Get(ctx)
   255  }
   256  
   257  var _ closedts.Dialer = (*Dialer)(nil).CTDialer()
   258  
   259  // CTDialer wraps the NodeDialer into a closedts.Dialer.
   260  func (n *Dialer) CTDialer() closedts.Dialer {
   261  	return (*dialerAdapter)(n)
   262  }