github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/transport_race.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // +build race 12 13 package kvcoord 14 15 import ( 16 "context" 17 "encoding/json" 18 "io/ioutil" 19 "math/rand" 20 "reflect" 21 "sync/atomic" 22 "time" 23 24 "github.com/cockroachdb/cockroach/pkg/roachpb" 25 "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" 26 "github.com/cockroachdb/cockroach/pkg/util/log" 27 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 28 ) 29 30 var running int32 // atomically updated 31 var incoming chan *roachpb.BatchRequest 32 33 func init() { 34 incoming = make(chan *roachpb.BatchRequest, 100) 35 } 36 37 const defaultRaceInterval = 150 * time.Microsecond 38 39 func jitter(avgInterval time.Duration) time.Duration { 40 // Use defaultRaceInterval as a minimum to limit how much time 41 // we spend here. 42 if avgInterval < defaultRaceInterval { 43 avgInterval = defaultRaceInterval 44 } 45 return time.Duration(rand.Int63n(int64(2 * avgInterval))) 46 } 47 48 // raceTransport wrap a Transport implementation and intercepts all 49 // BatchRequests, sending them to the transport racer task to read 50 // them asynchronously in a tight loop. 51 type raceTransport struct { 52 Transport 53 } 54 55 func (tr raceTransport) SendNext( 56 ctx context.Context, ba roachpb.BatchRequest, 57 ) (*roachpb.BatchResponse, error) { 58 // Make a copy of the requests slice, and shallow copies of the requests. 59 // The caller is allowed to mutate the request after the call returns. Since 60 // this transport has no way of checking who's doing mutations (the client - 61 // which is allowed, or the server - which is not). So, for now, we exclude 62 // the slice and the requests from any checks, since those are the parts that 63 // the client currently mutates. 64 requestsCopy := make([]roachpb.RequestUnion, len(ba.Requests)) 65 for i, ru := range ba.Requests { 66 // ru is a RequestUnion interface, so we need some hoops to dereference it. 67 requestsCopy[i] = reflect.Indirect(reflect.ValueOf(ru)).Interface().(roachpb.RequestUnion) 68 } 69 ba.Requests = requestsCopy 70 select { 71 // We have a shallow copy here and so the top level scalar fields can't 72 // really race, but making more copies doesn't make anything more 73 // transparent, so from now on we operate on a pointer. 74 case incoming <- &ba: 75 default: 76 // Avoid slowing down the tests if we're backed up. 77 } 78 return tr.Transport.SendNext(ctx, ba) 79 } 80 81 // GRPCTransportFactory during race builds wraps the implementation and 82 // intercepts all BatchRequests, reading them asynchronously in a tight loop. 83 // This allows the race detector to catch any mutations of a batch passed to the 84 // transport. The dealio is that batches passed to the transport are immutable - 85 // the server is not allowed to mutate anything and this transport makes sure 86 // they don't. See client.Sender() for more. 87 // 88 // NOTE(andrei): We don't like this transport very much. It's slow, preventing 89 // us from running clusters with race binaries and, the way it's written, it 90 // prevents both the client and the server from mutating the BatchRequest. But 91 // only the server is prohibited (according to the client.Sender interface). In 92 // fact, we'd like to have the client reuse these requests and mutate them. 93 // Instead of this transport, we should find other mechanisms ensuring that: 94 // a) the server doesn't hold on to any memory, and 95 // b) the server doesn't mutate the request 96 func GRPCTransportFactory( 97 opts SendOptions, nodeDialer *nodedialer.Dialer, replicas ReplicaSlice, 98 ) (Transport, error) { 99 if atomic.AddInt32(&running, 1) <= 1 { 100 // NB: We can't use Stopper.RunWorker because doing so would race with 101 // calling Stopper.Stop. 102 if err := nodeDialer.Stopper().RunAsyncTask( 103 context.TODO(), "transport racer", func(ctx context.Context) { 104 var iters int 105 var curIdx int 106 defer func() { 107 atomic.StoreInt32(&running, 0) 108 log.Infof( 109 ctx, 110 "transport race promotion: ran %d iterations on up to %d requests", 111 iters, curIdx+1, 112 ) 113 }() 114 // Make a fixed-size slice of *BatchRequest. When full, entries 115 // are evicted in FIFO order. 116 const size = 1000 117 bas := make([]*roachpb.BatchRequest, size) 118 encoder := json.NewEncoder(ioutil.Discard) 119 for { 120 iters++ 121 start := timeutil.Now() 122 for _, ba := range bas { 123 if ba != nil { 124 if err := encoder.Encode(ba); err != nil { 125 panic(err) 126 } 127 } 128 } 129 // Prevent the goroutine from spinning too hot as this lets CI 130 // times skyrocket. Sleep on average for as long as we worked 131 // on the last iteration so we spend no more than half our CPU 132 // time on this task. 133 jittered := time.After(jitter(timeutil.Since(start))) 134 // Collect incoming requests until the jittered timer fires, 135 // then access everything we have. 136 for { 137 select { 138 case <-nodeDialer.Stopper().ShouldQuiesce(): 139 return 140 case ba := <-incoming: 141 bas[curIdx%size] = ba 142 curIdx++ 143 continue 144 case <-jittered: 145 } 146 break 147 } 148 } 149 }); err != nil { 150 // Failed to start async task, reset our state. 151 atomic.StoreInt32(&running, 0) 152 } 153 } 154 155 t, err := grpcTransportFactoryImpl(opts, nodeDialer, replicas) 156 if err != nil { 157 return nil, err 158 } 159 return &raceTransport{Transport: t}, nil 160 }