github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/transport_race.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // +build race
    12  
    13  package kvcoord
    14  
    15  import (
    16  	"context"
    17  	"encoding/json"
    18  	"io/ioutil"
    19  	"math/rand"
    20  	"reflect"
    21  	"sync/atomic"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    25  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  )
    29  
    30  var running int32 // atomically updated
    31  var incoming chan *roachpb.BatchRequest
    32  
    33  func init() {
    34  	incoming = make(chan *roachpb.BatchRequest, 100)
    35  }
    36  
    37  const defaultRaceInterval = 150 * time.Microsecond
    38  
    39  func jitter(avgInterval time.Duration) time.Duration {
    40  	// Use defaultRaceInterval as a minimum to limit how much time
    41  	// we spend here.
    42  	if avgInterval < defaultRaceInterval {
    43  		avgInterval = defaultRaceInterval
    44  	}
    45  	return time.Duration(rand.Int63n(int64(2 * avgInterval)))
    46  }
    47  
// raceTransport wraps a Transport implementation and intercepts all
// BatchRequests, sending them to the transport racer task, which reads
// them asynchronously in a tight loop.
type raceTransport struct {
	// Transport is the wrapped implementation. It is embedded, so any method
	// that raceTransport does not define itself is promoted from it.
	Transport
}
    54  
    55  func (tr raceTransport) SendNext(
    56  	ctx context.Context, ba roachpb.BatchRequest,
    57  ) (*roachpb.BatchResponse, error) {
    58  	// Make a copy of the requests slice, and shallow copies of the requests.
    59  	// The caller is allowed to mutate the request after the call returns. Since
    60  	// this transport has no way of checking who's doing mutations (the client -
    61  	// which is allowed, or the server - which is not). So, for now, we exclude
    62  	// the slice and the requests from any checks, since those are the parts that
    63  	// the client currently mutates.
    64  	requestsCopy := make([]roachpb.RequestUnion, len(ba.Requests))
    65  	for i, ru := range ba.Requests {
    66  		// ru is a RequestUnion interface, so we need some hoops to dereference it.
    67  		requestsCopy[i] = reflect.Indirect(reflect.ValueOf(ru)).Interface().(roachpb.RequestUnion)
    68  	}
    69  	ba.Requests = requestsCopy
    70  	select {
    71  	// We have a shallow copy here and so the top level scalar fields can't
    72  	// really race, but making more copies doesn't make anything more
    73  	// transparent, so from now on we operate on a pointer.
    74  	case incoming <- &ba:
    75  	default:
    76  		// Avoid slowing down the tests if we're backed up.
    77  	}
    78  	return tr.Transport.SendNext(ctx, ba)
    79  }
    80  
    81  // GRPCTransportFactory during race builds wraps the implementation and
    82  // intercepts all BatchRequests, reading them asynchronously in a tight loop.
    83  // This allows the race detector to catch any mutations of a batch passed to the
    84  // transport. The dealio is that batches passed to the transport are immutable -
    85  // the server is not allowed to mutate anything and this transport makes sure
    86  // they don't. See client.Sender() for more.
    87  //
    88  // NOTE(andrei): We don't like this transport very much. It's slow, preventing
    89  // us from running clusters with race binaries and, the way it's written, it
    90  // prevents both the client and the server from mutating the BatchRequest. But
    91  // only the server is prohibited (according to the client.Sender interface). In
    92  // fact, we'd like to have the client reuse these requests and mutate them.
    93  // Instead of this transport, we should find other mechanisms ensuring that:
    94  // a) the server doesn't hold on to any memory, and
    95  // b) the server doesn't mutate the request
    96  func GRPCTransportFactory(
    97  	opts SendOptions, nodeDialer *nodedialer.Dialer, replicas ReplicaSlice,
    98  ) (Transport, error) {
    99  	if atomic.AddInt32(&running, 1) <= 1 {
   100  		// NB: We can't use Stopper.RunWorker because doing so would race with
   101  		// calling Stopper.Stop.
   102  		if err := nodeDialer.Stopper().RunAsyncTask(
   103  			context.TODO(), "transport racer", func(ctx context.Context) {
   104  				var iters int
   105  				var curIdx int
   106  				defer func() {
   107  					atomic.StoreInt32(&running, 0)
   108  					log.Infof(
   109  						ctx,
   110  						"transport race promotion: ran %d iterations on up to %d requests",
   111  						iters, curIdx+1,
   112  					)
   113  				}()
   114  				// Make a fixed-size slice of *BatchRequest. When full, entries
   115  				// are evicted in FIFO order.
   116  				const size = 1000
   117  				bas := make([]*roachpb.BatchRequest, size)
   118  				encoder := json.NewEncoder(ioutil.Discard)
   119  				for {
   120  					iters++
   121  					start := timeutil.Now()
   122  					for _, ba := range bas {
   123  						if ba != nil {
   124  							if err := encoder.Encode(ba); err != nil {
   125  								panic(err)
   126  							}
   127  						}
   128  					}
   129  					// Prevent the goroutine from spinning too hot as this lets CI
   130  					// times skyrocket. Sleep on average for as long as we worked
   131  					// on the last iteration so we spend no more than half our CPU
   132  					// time on this task.
   133  					jittered := time.After(jitter(timeutil.Since(start)))
   134  					// Collect incoming requests until the jittered timer fires,
   135  					// then access everything we have.
   136  					for {
   137  						select {
   138  						case <-nodeDialer.Stopper().ShouldQuiesce():
   139  							return
   140  						case ba := <-incoming:
   141  							bas[curIdx%size] = ba
   142  							curIdx++
   143  							continue
   144  						case <-jittered:
   145  						}
   146  						break
   147  					}
   148  				}
   149  			}); err != nil {
   150  			// Failed to start async task, reset our state.
   151  			atomic.StoreInt32(&running, 0)
   152  		}
   153  	}
   154  
   155  	t, err := grpcTransportFactoryImpl(opts, nodeDialer, replicas)
   156  	if err != nil {
   157  		return nil, err
   158  	}
   159  	return &raceTransport{Transport: t}, nil
   160  }