// Origin: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/rpc/nodedialer/nodedialer.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package nodedialer

import (
	"context"
	"fmt"
	"net"
	"time"
	"unsafe"

	circuit "github.com/cockroachdb/circuitbreaker"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"google.golang.org/grpc"
)

// logPerNodeFailInterval is the interval handed to log.Every for each node's
// breaker: no more than one failure to connect to a given node will be logged
// in the given interval.
const logPerNodeFailInterval = time.Minute

// wrappedBreaker bundles a node's circuit breaker with a log.EveryN, so that
// the same value can both gate dial attempts and tell the dialer whether a
// connection failure should be logged.
type wrappedBreaker struct {
	*circuit.Breaker
	log.EveryN
}

// An AddressResolver translates NodeIDs into addresses.
type AddressResolver func(roachpb.NodeID) (net.Addr, error)

// A Dialer wraps an *rpc.Context for dialing based on node IDs. For each node,
// it maintains a circuit breaker that prevents rapid connection attempts and
// provides hints to the callers on whether to log the outcome of the operation.
type Dialer struct {
	rpcContext *rpc.Context
	resolver   AddressResolver

	// breakers is indexed by connection class; each entry maps a node ID to
	// that node's breaker for the class.
	breakers [rpc.NumConnectionClasses]syncutil.IntMap // map[roachpb.NodeID]*wrappedBreaker
}

// New initializes a Dialer.
55 func New(rpcContext *rpc.Context, resolver AddressResolver) *Dialer { 56 return &Dialer{ 57 rpcContext: rpcContext, 58 resolver: resolver, 59 } 60 } 61 62 // Stopper returns this node dialer's Stopper. 63 // TODO(bdarnell): This is a bit of a hack for kv/transport_race.go 64 func (n *Dialer) Stopper() *stop.Stopper { 65 return n.rpcContext.Stopper 66 } 67 68 // Silence lint warning because this method is only used in race builds. 69 var _ = (*Dialer).Stopper 70 71 // Dial returns a grpc connection to the given node. It logs whenever the 72 // node first becomes unreachable or reachable. 73 func (n *Dialer) Dial( 74 ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass, 75 ) (_ *grpc.ClientConn, err error) { 76 if n == nil || n.resolver == nil { 77 return nil, errors.New("no node dialer configured") 78 } 79 // Don't trip the breaker if we're already canceled. 80 if ctxErr := ctx.Err(); ctxErr != nil { 81 return nil, ctxErr 82 } 83 breaker := n.getBreaker(nodeID, class) 84 addr, err := n.resolver(nodeID) 85 if err != nil { 86 err = errors.Wrapf(err, "failed to resolve n%d", nodeID) 87 breaker.Fail(err) 88 return nil, err 89 } 90 return n.dial(ctx, nodeID, addr, breaker, class) 91 } 92 93 // DialNoBreaker ignores the breaker if there is an error dialing. This function 94 // should only be used when there is good reason to believe that the node is reachable. 95 func (n *Dialer) DialNoBreaker( 96 ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass, 97 ) (_ *grpc.ClientConn, err error) { 98 if n == nil || n.resolver == nil { 99 return nil, errors.New("no node dialer configured") 100 } 101 addr, err := n.resolver(nodeID) 102 if err != nil { 103 return nil, err 104 } 105 return n.dial(ctx, nodeID, addr, nil /* breaker */, class) 106 } 107 108 // DialInternalClient is a specialization of DialClass for callers that 109 // want a roachpb.InternalClient. This supports an optimization to bypass the 110 // network for the local node. 
Returns a context.Context which should be used 111 // when making RPC calls on the returned server. (This context is annotated to 112 // mark this request as in-process and bypass ctx.Peer checks). 113 func (n *Dialer) DialInternalClient( 114 ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass, 115 ) (context.Context, roachpb.InternalClient, error) { 116 if n == nil || n.resolver == nil { 117 return nil, nil, errors.New("no node dialer configured") 118 } 119 addr, err := n.resolver(nodeID) 120 if err != nil { 121 return nil, nil, err 122 } 123 if localClient := n.rpcContext.GetLocalInternalClientForAddr(addr.String(), nodeID); localClient != nil { 124 log.VEvent(ctx, 2, "sending request to local client") 125 126 // Create a new context from the existing one with the "local request" field set. 127 // This tells the handler that this is an in-process request, bypassing ctx.Peer checks. 128 localCtx := grpcutil.NewLocalRequestContext(ctx) 129 130 return localCtx, localClient, nil 131 } 132 log.VEventf(ctx, 2, "sending request to %s", addr) 133 conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), class) 134 if err != nil { 135 return nil, nil, err 136 } 137 return ctx, roachpb.NewInternalClient(conn), err 138 } 139 140 // dial performs the dialing of the remote connection. If breaker is nil, 141 // then perform this logic without using any breaker functionality. 142 func (n *Dialer) dial( 143 ctx context.Context, 144 nodeID roachpb.NodeID, 145 addr net.Addr, 146 breaker *wrappedBreaker, 147 class rpc.ConnectionClass, 148 ) (_ *grpc.ClientConn, err error) { 149 // Don't trip the breaker if we're already canceled. 150 if ctxErr := ctx.Err(); ctxErr != nil { 151 return nil, ctxErr 152 } 153 if breaker != nil && !breaker.Ready() { 154 err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID) 155 return nil, err 156 } 157 defer func() { 158 // Enforce a minimum interval between warnings for failed connections. 
159 if err != nil && ctx.Err() == nil && breaker != nil && breaker.ShouldLog() { 160 log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err) 161 } 162 }() 163 conn, err := n.rpcContext.GRPCDialNode(addr.String(), nodeID, class).Connect(ctx) 164 if err != nil { 165 // If we were canceled during the dial, don't trip the breaker. 166 if ctxErr := ctx.Err(); ctxErr != nil { 167 return nil, ctxErr 168 } 169 err = errors.Wrapf(err, "failed to connect to n%d at %v", nodeID, addr) 170 if breaker != nil { 171 breaker.Fail(err) 172 } 173 return nil, err 174 } 175 // Check to see if the connection is in the transient failure state. This can 176 // happen if the connection already existed, but a recent heartbeat has 177 // failed and we haven't yet torn down the connection. 178 err = grpcutil.ConnectionReady(conn) 179 if err := grpcutil.ConnectionReady(conn); err != nil { 180 err = errors.Wrapf(err, "failed to check for ready connection to n%d at %v", nodeID, addr) 181 if breaker != nil { 182 breaker.Fail(err) 183 } 184 return nil, err 185 } 186 187 // TODO(bdarnell): Reconcile the different health checks and circuit breaker 188 // behavior in this file. Note that this different behavior causes problems 189 // for higher-levels in the system. For example, DistSQL checks for 190 // ConnHealth when scheduling processors, but can then see attempts to send 191 // RPCs fail when dial fails due to an open breaker. Reset the breaker here 192 // as a stop-gap before the reconciliation occurs. 193 if breaker != nil { 194 breaker.Success() 195 } 196 return conn, nil 197 } 198 199 // ConnHealth returns nil if we have an open connection of the request 200 // class to the given node that succeeded on its most recent heartbeat. See the 201 // method of the same name on rpc.Context for more details. 
202 func (n *Dialer) ConnHealth(nodeID roachpb.NodeID, class rpc.ConnectionClass) error { 203 if n == nil || n.resolver == nil { 204 return errors.New("no node dialer configured") 205 } 206 if !n.getBreaker(nodeID, class).Ready() { 207 return circuit.ErrBreakerOpen 208 } 209 addr, err := n.resolver(nodeID) 210 if err != nil { 211 return err 212 } 213 // TODO(bdarnell): GRPCDialNode should detect local addresses and return 214 // a dummy connection instead of requiring callers to do this check. 215 if n.rpcContext.GetLocalInternalClientForAddr(addr.String(), nodeID) != nil { 216 // The local client is always considered healthy. 217 return nil 218 } 219 conn := n.rpcContext.GRPCDialNode(addr.String(), nodeID, class) 220 return conn.Health() 221 } 222 223 // GetCircuitBreaker retrieves the circuit breaker for connections to the 224 // given node. The breaker should not be mutated as this affects all connections 225 // dialing to that node through this NodeDialer. 226 func (n *Dialer) GetCircuitBreaker( 227 nodeID roachpb.NodeID, class rpc.ConnectionClass, 228 ) *circuit.Breaker { 229 return n.getBreaker(nodeID, class).Breaker 230 } 231 232 func (n *Dialer) getBreaker(nodeID roachpb.NodeID, class rpc.ConnectionClass) *wrappedBreaker { 233 breakers := &n.breakers[class] 234 value, ok := breakers.Load(int64(nodeID)) 235 if !ok { 236 name := fmt.Sprintf("rpc %v [n%d]", n.rpcContext.Config.Addr, nodeID) 237 breaker := &wrappedBreaker{Breaker: n.rpcContext.NewBreaker(name), EveryN: log.Every(logPerNodeFailInterval)} 238 value, _ = breakers.LoadOrStore(int64(nodeID), unsafe.Pointer(breaker)) 239 } 240 return (*wrappedBreaker)(value) 241 } 242 243 type dialerAdapter Dialer 244 245 func (da *dialerAdapter) Ready(nodeID roachpb.NodeID) bool { 246 return (*Dialer)(da).GetCircuitBreaker(nodeID, rpc.DefaultClass).Ready() 247 } 248 249 func (da *dialerAdapter) Dial(ctx context.Context, nodeID roachpb.NodeID) (ctpb.Client, error) { 250 c, err := (*Dialer)(da).Dial(ctx, nodeID, 
rpc.DefaultClass) 251 if err != nil { 252 return nil, err 253 } 254 return ctpb.NewClosedTimestampClient(c).Get(ctx) 255 } 256 257 var _ closedts.Dialer = (*Dialer)(nil).CTDialer() 258 259 // CTDialer wraps the NodeDialer into a closedts.Dialer. 260 func (n *Dialer) CTDialer() closedts.Dialer { 261 return (*dialerAdapter)(n) 262 }