github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/transport.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvcoord

import (
	"context"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	opentracing "github.com/opentracing/opentracing-go"
)

// A SendOptions structure describes how RPCs are sent to one or more
// replicas: which rpc.ConnectionClass to dial connections with, and
// which DistSender metrics to update as requests are sent.
type SendOptions struct {
	class   rpc.ConnectionClass
	metrics *DistSenderMetrics
}

// batchClient tracks the per-replica state used to order and retry RPCs:
// whether the replica's connection was healthy when the transport was
// built, whether a failed attempt can be retried, and the earliest time
// (deadline) at which such a retry may happen.
type batchClient struct {
	replica   roachpb.ReplicaDescriptor
	healthy   bool
	retryable bool
	deadline  time.Time
}

// TransportFactory encapsulates all interaction with the RPC
// subsystem, allowing it to be mocked out for testing. The factory
// function returns a Transport object which is used to send requests
// to one or more replicas in the slice.
//
// In addition to actually sending RPCs, the transport is responsible
// for ordering replicas in accordance with SendOptions and
// transport-specific knowledge such as connection health or latency.
//
// TODO(bdarnell): clean up this crufty interface; it was extracted
// verbatim from the non-abstracted code.
type TransportFactory func(
	SendOptions, *nodedialer.Dialer, ReplicaSlice,
) (Transport, error)

// Transport objects can send RPCs to one or more replicas of a range.
// All calls to Transport methods are made from a single thread, so
// Transports are not required to be thread-safe.
type Transport interface {
	// IsExhausted returns true if there are no more replicas to try.
	IsExhausted() bool

	// SendNext synchronously sends the BatchRequest rpc to the next replica.
	// May panic if the transport is exhausted.
	//
	// SendNext is also in charge of importing the remotely collected spans (if
	// any) into the local trace.
	SendNext(context.Context, roachpb.BatchRequest) (*roachpb.BatchResponse, error)

	// NextInternalClient returns the InternalClient to use for making RPC
	// calls, along with a context.Context which should be used when making
	// RPC calls on the returned client (this context is annotated to mark
	// the request as in-process and bypass ctx.Peer checks).
	NextInternalClient(context.Context) (context.Context, roachpb.InternalClient, error)

	// NextReplica returns the replica descriptor of the replica to be tried in
	// the next call to SendNext. MoveToFront will cause the return value to
	// change. Returns a zero value if the transport is exhausted.
	NextReplica() roachpb.ReplicaDescriptor

	// MoveToFront locates the specified replica and moves it to the
	// front of the ordering of replicas to try. If the replica has
	// already been tried, it will be retried. If the specified replica
	// can't be found, this is a noop.
	MoveToFront(roachpb.ReplicaDescriptor)
}

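// The sketch below shows how a caller is expected to drive a Transport:
// try replicas one at a time until a usable response arrives or the
// transport is exhausted. This is an illustrative sketch only, not the
// actual retry loop in DistSender, which layers considerably more error
// handling on top:
//
//   func trySend(
//   	ctx context.Context, t Transport, ba roachpb.BatchRequest,
//   ) (*roachpb.BatchResponse, error) {
//   	for !t.IsExhausted() {
//   		if br, err := t.SendNext(ctx, ba); err == nil && br.Error == nil {
//   			return br, nil
//   		}
//   	}
//   	return nil, errors.New("no replica returned a usable response")
//   }
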
// grpcTransportFactoryImpl is the default TransportFactory, using GRPC.
// Do not use this directly - use grpcTransportFactory instead.
//
// During race builds, we wrap this to hold on to and read all obtained
// requests in a tight loop, exposing data races; see transport_race.go.
func grpcTransportFactoryImpl(
	opts SendOptions, nodeDialer *nodedialer.Dialer, replicas ReplicaSlice,
) (Transport, error) {
	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		healthy := nodeDialer.ConnHealth(replica.NodeID, opts.class) == nil
		clients = append(clients, batchClient{
			replica: replica.ReplicaDescriptor,
			healthy: healthy,
		})
	}

	// Put known-healthy clients first.
	splitHealthy(clients)

	return &grpcTransport{
		opts:           opts,
		nodeDialer:     nodeDialer,
		class:          opts.class,
		orderedClients: clients,
	}, nil
}

type grpcTransport struct {
	opts       SendOptions
	nodeDialer *nodedialer.Dialer
	class      rpc.ConnectionClass
	// clientIndex is the index into orderedClients of the next replica to
	// try; replicas at smaller indexes have already been tried.
	clientIndex    int
	orderedClients []batchClient
}

// IsExhausted returns false if there are any untried replicas remaining. If
// there are none, it attempts to resurrect replicas which were tried but
// failed with a retryable error. If any were resurrected, returns false;
// true otherwise.
func (gt *grpcTransport) IsExhausted() bool {
	if gt.clientIndex < len(gt.orderedClients) {
		return false
	}
	return !gt.maybeResurrectRetryablesLocked()
}

// maybeResurrectRetryablesLocked moves already-tried replicas which
// experienced a retryable error (currently this means a
// NotLeaseHolderError) into a newly-active state so that they can be
// retried. Returns true if any replicas were moved to active.
func (gt *grpcTransport) maybeResurrectRetryablesLocked() bool {
	var resurrect []batchClient
	for i := 0; i < gt.clientIndex; i++ {
		if c := gt.orderedClients[i]; c.retryable && timeutil.Since(c.deadline) >= 0 {
			resurrect = append(resurrect, c)
		}
	}
	for _, c := range resurrect {
		gt.moveToFrontLocked(c.replica)
	}
	return len(resurrect) > 0
}

// SendNext synchronously sends the batch to the next replica in the
// ordering and returns its response or an error. If the response carries
// a NotLeaseHolderError, the replica is marked as eligible for a later
// retry; see setState.
func (gt *grpcTransport) SendNext(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	client := gt.orderedClients[gt.clientIndex]
	ctx, iface, err := gt.NextInternalClient(ctx)
	if err != nil {
		return nil, err
	}

	ba.Replica = client.replica
	reply, err := gt.sendBatch(ctx, client.replica.NodeID, iface, ba)

	// NotLeaseHolderErrors can be retried.
	var retryable bool
	if reply != nil && reply.Error != nil {
		// TODO(spencer): pass the lease expiration when setting the state
		// to set a more efficient deadline for retrying this replica.
		if _, ok := reply.Error.GetDetail().(*roachpb.NotLeaseHolderError); ok {
			retryable = true
		}
	}
	gt.setState(client.replica, retryable)

	return reply, err
}

// NB: nodeID is unused, but accessible in stack traces.
func (gt *grpcTransport) sendBatch(
	ctx context.Context, nodeID roachpb.NodeID, iface roachpb.InternalClient, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// Bail out early if the context is already canceled. (GRPC will
	// detect this pretty quickly, but the first check of the context
	// in the local server comes pretty late)
	if ctx.Err() != nil {
		return nil, errors.Wrap(ctx.Err(), "aborted before batch send")
	}

	gt.opts.metrics.SentCount.Inc(1)
	if rpc.IsLocal(iface) {
		gt.opts.metrics.LocalSentCount.Inc(1)
	}
	reply, err := iface.Batch(ctx, &ba)
	// If we queried a remote node, perform extra validation and
	// import trace spans.
	if reply != nil && !rpc.IsLocal(iface) {
		for i := range reply.Responses {
			if err := reply.Responses[i].GetInner().Verify(ba.Requests[i].GetInner()); err != nil {
				log.Errorf(ctx, "%v", err)
			}
		}
		// Import the remotely collected spans, if any.
		if len(reply.CollectedSpans) != 0 {
			span := opentracing.SpanFromContext(ctx)
			if span == nil {
				return nil, errors.Errorf(
					"trying to ingest remote spans but there is no recording span set up")
			}
			if err := tracing.ImportRemoteSpans(span, reply.CollectedSpans); err != nil {
				return nil, errors.Wrap(err, "error ingesting remote spans")
			}
		}
	}
	return reply, err
}

// NextInternalClient returns the next InternalClient to use for performing
// RPCs.
func (gt *grpcTransport) NextInternalClient(
	ctx context.Context,
) (context.Context, roachpb.InternalClient, error) {
	client := gt.orderedClients[gt.clientIndex]
	gt.clientIndex++
	return gt.nodeDialer.DialInternalClient(ctx, client.replica.NodeID, gt.class)
}

func (gt *grpcTransport) NextReplica() roachpb.ReplicaDescriptor {
	// Note that IsExhausted, as a side effect, may resurrect previously
	// tried replicas whose retry deadline has passed.
	if gt.IsExhausted() {
		return roachpb.ReplicaDescriptor{}
	}
	return gt.orderedClients[gt.clientIndex].replica
}

func (gt *grpcTransport) MoveToFront(replica roachpb.ReplicaDescriptor) {
	gt.moveToFrontLocked(replica)
}

func (gt *grpcTransport) moveToFrontLocked(replica roachpb.ReplicaDescriptor) {
	for i := range gt.orderedClients {
		if gt.orderedClients[i].replica == replica {
			// Clear the retryable bit as this replica is being made
			// available.
			gt.orderedClients[i].retryable = false
			gt.orderedClients[i].deadline = time.Time{}
			// If we've already processed the replica, decrement the current
			// index before we swap.
			if i < gt.clientIndex {
				gt.clientIndex--
			}
			// Swap the client representing this replica to the front of the
			// untried replicas.
			gt.orderedClients[i], gt.orderedClients[gt.clientIndex] =
				gt.orderedClients[gt.clientIndex], gt.orderedClients[i]
			return
		}
	}
}
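
// To make the index manipulation above concrete, here is a small worked
// example (illustrative only): suppose orderedClients is [A B C] and
// clientIndex is 2, meaning A and B have been tried and C is next. A call
// to MoveToFront(A) finds A at index 0, decrements clientIndex to 1, and
// swaps A with the client at index 1, leaving [B A C] with clientIndex 1:
// A is now the next replica to try, and C remains queued after it.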

// NB: this method's callers may have a reference to the client they wish to
// mutate, but the clients reside in a slice which is reordered by
// MoveToFront, making it unsafe to mutate a client through a previously
// obtained reference; hence the lookup by replica descriptor here.
func (gt *grpcTransport) setState(replica roachpb.ReplicaDescriptor, retryable bool) {
	for i := range gt.orderedClients {
		if gt.orderedClients[i].replica == replica {
			gt.orderedClients[i].retryable = retryable
			if retryable {
				gt.orderedClients[i].deadline = timeutil.Now().Add(time.Second)
			}
			break
		}
	}
}
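
// Timeline sketch of the retry state machine (illustrative): a replica
// that returns a NotLeaseHolderError at time T is marked retryable with
// deadline T+1s. Until T+1s, maybeResurrectRetryablesLocked skips it; once
// the transport is otherwise exhausted and the deadline has passed, the
// replica is moved back to the front and tried again.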

// splitHealthy splits the provided client slice into healthy clients and
// unhealthy clients, based on their connection state. Healthy clients are
// moved to the front of the slice and unhealthy clients to the back; within
// each group the original order is preserved (the sort is stable). Returns
// the number of healthy clients.
func splitHealthy(clients []batchClient) int {
	var nHealthy int
	sort.Stable(byHealth(clients))
	for _, client := range clients {
		if client.healthy {
			nHealthy++
		}
	}
	return nHealthy
}

// byHealth sorts a slice of batchClients by their health with healthy first.
type byHealth []batchClient

func (h byHealth) Len() int           { return len(h) }
func (h byHealth) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h byHealth) Less(i, j int) bool { return h[i].healthy && !h[j].healthy }

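// As a concrete illustration of splitHealthy (hypothetical node IDs):
// given clients on nodes [1(unhealthy) 2(healthy) 3(unhealthy) 4(healthy)],
// the stable sort above produces [2 4 1 3] and returns 2: the healthy
// clients move to the front while each group keeps its relative order.
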
// SenderTransportFactory wraps a kv.Sender for use as a KV
// Transport. This is useful for tests that want to use DistSender
// without a full RPC stack.
func SenderTransportFactory(tracer opentracing.Tracer, sender kv.Sender) TransportFactory {
	return func(
		_ SendOptions, _ *nodedialer.Dialer, replicas ReplicaSlice,
	) (Transport, error) {
		// Always send to the first replica.
		replica := replicas[0].ReplicaDescriptor
		return &senderTransport{tracer, sender, replica, false}, nil
	}
}
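
// A sketch of how a test might plug this factory into a DistSender
// (illustrative; the knob names below follow this package's testing
// knobs, but the exact wiring depends on the test harness in use):
//
//   factory := SenderTransportFactory(tracing.NewTracer(), sender) // sender implements kv.Sender
//   cfg := DistSenderConfig{
//   	TestingKnobs: ClientTestingKnobs{TransportFactory: factory},
//   }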

// senderTransport is a Transport that sends all requests to a single
// in-process kv.Sender. It is exhausted after one call to SendNext.
type senderTransport struct {
	tracer  opentracing.Tracer
	sender  kv.Sender
	replica roachpb.ReplicaDescriptor

	called bool
}

func (s *senderTransport) IsExhausted() bool {
	return s.called
}

func (s *senderTransport) SendNext(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if s.called {
		panic("called an exhausted transport")
	}
	s.called = true

	ctx, cleanup := tracing.EnsureContext(ctx, s.tracer, "node" /* name */)
	defer cleanup()

	ba.Replica = s.replica
	log.Eventf(ctx, "%v", ba.String())
	br, pErr := s.sender.Send(ctx, ba)
	if br == nil {
		br = &roachpb.BatchResponse{}
	}
	if br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(s.sender, br))
	}
	br.Error = pErr
	if pErr != nil {
		log.Eventf(ctx, "error: %v", pErr.String())
	}

	// Import the remotely collected spans, if any.
	if len(br.CollectedSpans) != 0 {
		span := opentracing.SpanFromContext(ctx)
		if span == nil {
			panic("trying to ingest remote spans but there is no recording span set up")
		}
		if err := tracing.ImportRemoteSpans(span, br.CollectedSpans); err != nil {
			panic(err)
		}
	}

	return br, nil
}

func (s *senderTransport) NextInternalClient(
	ctx context.Context,
) (context.Context, roachpb.InternalClient, error) {
	panic("unimplemented")
}

func (s *senderTransport) NextReplica() roachpb.ReplicaDescriptor {
	if s.IsExhausted() {
		return roachpb.ReplicaDescriptor{}
	}
	return s.replica
}

// MoveToFront is a no-op: a senderTransport only ever has one replica.
func (s *senderTransport) MoveToFront(replica roachpb.ReplicaDescriptor) {
}