github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/transport.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvcoord

import (
	"context"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	opentracing "github.com/opentracing/opentracing-go"
)

// A SendOptions structure describes the algorithm for sending RPCs to one or
// more replicas, depending on error conditions and how many successful
// responses are required.
type SendOptions struct {
	class   rpc.ConnectionClass
	metrics *DistSenderMetrics
}

type batchClient struct {
	replica   roachpb.ReplicaDescriptor
	healthy   bool
	retryable bool
	deadline  time.Time
}

// TransportFactory encapsulates all interaction with the RPC
// subsystem, allowing it to be mocked out for testing. The factory
// function returns a Transport object which is used to send requests
// to one or more replicas in the slice.
//
// In addition to actually sending RPCs, the transport is responsible
// for ordering replicas in accordance with SendOptions and
// transport-specific knowledge such as connection health or latency.
//
// TODO(bdarnell): clean up this crufty interface; it was extracted
// verbatim from the non-abstracted code.
type TransportFactory func(
	SendOptions, *nodedialer.Dialer, ReplicaSlice,
) (Transport, error)

// Transport objects can send RPCs to one or more replicas of a range.
// All calls to Transport methods are made from a single thread, so
// Transports are not required to be thread-safe.
type Transport interface {
	// IsExhausted returns true if there are no more replicas to try.
	IsExhausted() bool

	// SendNext synchronously sends the BatchRequest RPC to the next replica.
	// May panic if the transport is exhausted.
	//
	// SendNext is also in charge of importing the remotely collected spans (if
	// any) into the local trace.
	SendNext(context.Context, roachpb.BatchRequest) (*roachpb.BatchResponse, error)

	// NextInternalClient returns the InternalClient to use for making RPC
	// calls. Returns a context.Context which should be used when making RPC
	// calls on the returned client (this context is annotated to mark this
	// request as in-process and bypass ctx.Peer checks).
	NextInternalClient(context.Context) (context.Context, roachpb.InternalClient, error)

	// NextReplica returns the replica descriptor of the replica to be tried in
	// the next call to SendNext. MoveToFront will cause the return value to
	// change. Returns a zero value if the transport is exhausted.
	NextReplica() roachpb.ReplicaDescriptor

	// MoveToFront locates the specified replica and moves it to the
	// front of the ordering of replicas to try. If the replica has
	// already been tried, it will be retried. If the specified replica
	// can't be found, this is a noop.
	MoveToFront(roachpb.ReplicaDescriptor)
}
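
// The Transport contract above is easiest to see as the loop a caller
// drives. exampleSendToFirstAvailable is an illustrative, hypothetical
// helper (not part of this file's API, and not what DistSender actually
// does with errors): it simply tries each replica in order until one
// responds without a send error.
func exampleSendToFirstAvailable(
	ctx context.Context, t Transport, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	var lastErr error
	for !t.IsExhausted() {
		// SendNext advances the transport to the next replica and stamps
		// that replica into its copy of ba before sending.
		br, err := t.SendNext(ctx, ba)
		if err == nil {
			return br, nil
		}
		lastErr = err
	}
	if lastErr == nil {
		lastErr = errors.New("no replicas to try")
	}
	return nil, lastErr
}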
// grpcTransportFactoryImpl is the default TransportFactory, using GRPC.
// Do not use this directly - use grpcTransportFactory instead.
//
// During race builds, we wrap this to hold on to and read all obtained
// requests in a tight loop, exposing data races; see transport_race.go.
func grpcTransportFactoryImpl(
	opts SendOptions, nodeDialer *nodedialer.Dialer, replicas ReplicaSlice,
) (Transport, error) {
	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		healthy := nodeDialer.ConnHealth(replica.NodeID, opts.class) == nil
		clients = append(clients, batchClient{
			replica: replica.ReplicaDescriptor,
			healthy: healthy,
		})
	}

	// Put known-healthy clients first.
	splitHealthy(clients)

	return &grpcTransport{
		opts:           opts,
		nodeDialer:     nodeDialer,
		class:          opts.class,
		orderedClients: clients,
	}, nil
}

type grpcTransport struct {
	opts           SendOptions
	nodeDialer     *nodedialer.Dialer
	class          rpc.ConnectionClass
	clientIndex    int
	orderedClients []batchClient
}

// IsExhausted returns false if there are any untried replicas remaining. If
// there are none, it attempts to resurrect replicas which were tried but
// failed with a retryable error. If any were resurrected, returns false;
// true otherwise.
func (gt *grpcTransport) IsExhausted() bool {
	if gt.clientIndex < len(gt.orderedClients) {
		return false
	}
	return !gt.maybeResurrectRetryablesLocked()
}

// maybeResurrectRetryablesLocked moves already-tried replicas which
// experienced a retryable error (currently this means a
// NotLeaseHolderError) into a newly-active state so that they can be
// retried. Returns true if any replicas were moved to active.
func (gt *grpcTransport) maybeResurrectRetryablesLocked() bool {
	var resurrect []batchClient
	for i := 0; i < gt.clientIndex; i++ {
		if c := gt.orderedClients[i]; c.retryable && timeutil.Since(c.deadline) >= 0 {
			resurrect = append(resurrect, c)
		}
	}
	for _, c := range resurrect {
		gt.moveToFrontLocked(c.replica)
	}
	return len(resurrect) > 0
}
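
// exampleResurrection is an illustrative, hypothetical sketch (not used by
// production code) of the resurrection mechanism above: once the lone
// replica has been tried and marked retryable with a deadline already in
// the past, IsExhausted resurrects it and reports the transport as not
// exhausted.
func exampleResurrection() bool {
	gt := &grpcTransport{
		orderedClients: []batchClient{{
			replica:   roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1},
			retryable: true,
			// A deadline in the past makes the replica eligible for
			// resurrection immediately.
			deadline: timeutil.Now().Add(-time.Second),
		}},
		clientIndex: 1, // the lone replica has already been tried.
	}
	// IsExhausted calls maybeResurrectRetryablesLocked, which moves the
	// replica back into the active portion of orderedClients.
	return gt.IsExhausted() // false: the replica was resurrected.
}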
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the response is returned; otherwise an
// error is returned.
func (gt *grpcTransport) SendNext(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	client := gt.orderedClients[gt.clientIndex]
	ctx, iface, err := gt.NextInternalClient(ctx)
	if err != nil {
		return nil, err
	}

	ba.Replica = client.replica
	reply, err := gt.sendBatch(ctx, client.replica.NodeID, iface, ba)

	// NotLeaseHolderErrors can be retried.
	var retryable bool
	if reply != nil && reply.Error != nil {
		// TODO(spencer): pass the lease expiration when setting the state
		// to set a more efficient deadline for retrying this replica.
		if _, ok := reply.Error.GetDetail().(*roachpb.NotLeaseHolderError); ok {
			retryable = true
		}
	}
	gt.setState(client.replica, retryable)

	return reply, err
}

// NB: nodeID is unused, but accessible in stack traces.
func (gt *grpcTransport) sendBatch(
	ctx context.Context, nodeID roachpb.NodeID, iface roachpb.InternalClient, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// Bail out early if the context is already canceled. (GRPC will
	// detect this pretty quickly, but the first check of the context
	// in the local server comes pretty late.)
	if ctx.Err() != nil {
		return nil, errors.Wrap(ctx.Err(), "aborted before batch send")
	}

	gt.opts.metrics.SentCount.Inc(1)
	if rpc.IsLocal(iface) {
		gt.opts.metrics.LocalSentCount.Inc(1)
	}
	reply, err := iface.Batch(ctx, &ba)
	// If we queried a remote node, perform extra validation and
	// import trace spans.
	if reply != nil && !rpc.IsLocal(iface) {
		for i := range reply.Responses {
			if err := reply.Responses[i].GetInner().Verify(ba.Requests[i].GetInner()); err != nil {
				log.Errorf(ctx, "%v", err)
			}
		}
		// Import the remotely collected spans, if any.
		if len(reply.CollectedSpans) != 0 {
			span := opentracing.SpanFromContext(ctx)
			if span == nil {
				return nil, errors.Errorf(
					"trying to ingest remote spans but there is no recording span set up")
			}
			if err := tracing.ImportRemoteSpans(span, reply.CollectedSpans); err != nil {
				return nil, errors.Wrap(err, "error ingesting remote spans")
			}
		}
	}
	return reply, err
}

// NextInternalClient returns the next InternalClient to use for performing
// RPCs.
func (gt *grpcTransport) NextInternalClient(
	ctx context.Context,
) (context.Context, roachpb.InternalClient, error) {
	client := gt.orderedClients[gt.clientIndex]
	gt.clientIndex++
	return gt.nodeDialer.DialInternalClient(ctx, client.replica.NodeID, gt.class)
}

func (gt *grpcTransport) NextReplica() roachpb.ReplicaDescriptor {
	if gt.IsExhausted() {
		return roachpb.ReplicaDescriptor{}
	}
	return gt.orderedClients[gt.clientIndex].replica
}

func (gt *grpcTransport) MoveToFront(replica roachpb.ReplicaDescriptor) {
	gt.moveToFrontLocked(replica)
}

func (gt *grpcTransport) moveToFrontLocked(replica roachpb.ReplicaDescriptor) {
	for i := range gt.orderedClients {
		if gt.orderedClients[i].replica == replica {
			// Clear the retryable bit as this replica is being made
			// available.
			gt.orderedClients[i].retryable = false
			gt.orderedClients[i].deadline = time.Time{}
			// If we've already processed the replica, decrement the current
			// index before we swap.
			if i < gt.clientIndex {
				gt.clientIndex--
			}
			// Swap the client representing this replica to the front.
			gt.orderedClients[i], gt.orderedClients[gt.clientIndex] =
				gt.orderedClients[gt.clientIndex], gt.orderedClients[i]
			return
		}
	}
}
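
// exampleMoveToFront is an illustrative, hypothetical sketch (not used by
// production code) of MoveToFront's effect on an already-tried replica:
// clientIndex is decremented before the swap, so the next SendNext targets
// the specified replica again.
func exampleMoveToFront() roachpb.ReplicaDescriptor {
	r1 := roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1}
	r2 := roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 2, ReplicaID: 2}
	gt := &grpcTransport{
		orderedClients: []batchClient{{replica: r1}, {replica: r2}},
		clientIndex:    1, // r1 has been tried; r2 would be next.
	}
	gt.MoveToFront(r1)
	// clientIndex is back to 0 and r1 occupies the front slot, so it will
	// be retried before r2.
	return gt.NextReplica() // returns r1.
}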
// NB: this method's callers may have a reference to the client they wish to
// mutate, but the clients reside in a slice which is shuffled via
// MoveToFront, making it unsafe to mutate the client through a reference to
// the slice.
func (gt *grpcTransport) setState(replica roachpb.ReplicaDescriptor, retryable bool) {
	for i := range gt.orderedClients {
		if gt.orderedClients[i].replica == replica {
			gt.orderedClients[i].retryable = retryable
			if retryable {
				gt.orderedClients[i].deadline = timeutil.Now().Add(time.Second)
			}
			break
		}
	}
}

// splitHealthy splits the provided client slice into healthy clients and
// unhealthy clients, based on their connection state. Healthy clients will
// be rearranged first in the slice, and unhealthy clients will be rearranged
// last. Within these two groups, the rearrangement will be stable. The function
// will then return the number of healthy clients.
func splitHealthy(clients []batchClient) int {
	var nHealthy int
	sort.Stable(byHealth(clients))
	for _, client := range clients {
		if client.healthy {
			nHealthy++
		}
	}
	return nHealthy
}

// byHealth sorts a slice of batchClients by their health with healthy first.
type byHealth []batchClient

func (h byHealth) Len() int           { return len(h) }
func (h byHealth) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h byHealth) Less(i, j int) bool { return h[i].healthy && !h[j].healthy }

// SenderTransportFactory wraps a client.Sender for use as a KV
// Transport. This is useful for tests that want to use DistSender
// without a full RPC stack.
func SenderTransportFactory(tracer opentracing.Tracer, sender kv.Sender) TransportFactory {
	return func(
		_ SendOptions, _ *nodedialer.Dialer, replicas ReplicaSlice,
	) (Transport, error) {
		// Always send to the first replica.
		replica := replicas[0].ReplicaDescriptor
		return &senderTransport{tracer, sender, replica, false}, nil
	}
}

type senderTransport struct {
	tracer  opentracing.Tracer
	sender  kv.Sender
	replica roachpb.ReplicaDescriptor

	called bool
}

func (s *senderTransport) IsExhausted() bool {
	return s.called
}

func (s *senderTransport) SendNext(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if s.called {
		panic("called an exhausted transport")
	}
	s.called = true

	ctx, cleanup := tracing.EnsureContext(ctx, s.tracer, "node" /* name */)
	defer cleanup()

	ba.Replica = s.replica
	log.Eventf(ctx, "%v", ba.String())
	br, pErr := s.sender.Send(ctx, ba)
	if br == nil {
		br = &roachpb.BatchResponse{}
	}
	if br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(s.sender, br))
	}
	br.Error = pErr
	if pErr != nil {
		log.Eventf(ctx, "error: %v", pErr.String())
	}

	// Import the remotely collected spans, if any.
	if len(br.CollectedSpans) != 0 {
		span := opentracing.SpanFromContext(ctx)
		if span == nil {
			panic("trying to ingest remote spans but there is no recording span set up")
		}
		if err := tracing.ImportRemoteSpans(span, br.CollectedSpans); err != nil {
			panic(err)
		}
	}

	return br, nil
}

func (s *senderTransport) NextInternalClient(
	ctx context.Context,
) (context.Context, roachpb.InternalClient, error) {
	panic("unimplemented")
}

func (s *senderTransport) NextReplica() roachpb.ReplicaDescriptor {
	if s.IsExhausted() {
		return roachpb.ReplicaDescriptor{}
	}
	return s.replica
}

func (s *senderTransport) MoveToFront(replica roachpb.ReplicaDescriptor) {
}
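
// exampleSplitHealthy is an illustrative, hypothetical sketch (not used by
// production code) of the stable healthy-first partition performed by
// splitHealthy above: healthy clients move to the front while the relative
// order within each group is preserved.
func exampleSplitHealthy() int {
	clients := []batchClient{
		{replica: roachpb.ReplicaDescriptor{ReplicaID: 1}, healthy: false},
		{replica: roachpb.ReplicaDescriptor{ReplicaID: 2}, healthy: true},
		{replica: roachpb.ReplicaDescriptor{ReplicaID: 3}, healthy: true},
	}
	// After splitHealthy, the order is ReplicaID 2, 3, 1 and the return
	// value is 2 (the number of healthy clients).
	return splitHealthy(clients)
}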