github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/access/rpc/connection/manager.go (about) 1 package connection 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "time" 8 9 "github.com/onflow/crypto" 10 "github.com/rs/zerolog" 11 "github.com/sony/gobreaker" 12 "google.golang.org/grpc" 13 "google.golang.org/grpc/codes" 14 "google.golang.org/grpc/credentials" 15 "google.golang.org/grpc/credentials/insecure" 16 _ "google.golang.org/grpc/encoding/gzip" //required for gRPC compression 17 "google.golang.org/grpc/keepalive" 18 "google.golang.org/grpc/status" 19 20 _ "github.com/onflow/flow-go/engine/common/grpc/compressor/deflate" //required for gRPC compression 21 _ "github.com/onflow/flow-go/engine/common/grpc/compressor/snappy" //required for gRPC compression 22 "github.com/onflow/flow-go/module" 23 "github.com/onflow/flow-go/utils/grpcutils" 24 ) 25 26 // DefaultClientTimeout is used when making a GRPC request to a collection or execution node. 27 const DefaultClientTimeout = 3 * time.Second 28 29 type noopCloser struct{} 30 31 func (c *noopCloser) Close() error { 32 return nil 33 } 34 35 // Manager provides methods for getting and managing gRPC client connections. 36 type Manager struct { 37 logger zerolog.Logger 38 metrics module.AccessMetrics 39 cache *Cache 40 maxMsgSize uint 41 circuitBreakerConfig CircuitBreakerConfig 42 compressorName string 43 } 44 45 // CircuitBreakerConfig is a configuration struct for the circuit breaker. 46 type CircuitBreakerConfig struct { 47 // Enabled specifies whether the circuit breaker is enabled for collection and execution API clients. 48 Enabled bool 49 // RestoreTimeout specifies the duration after which the circuit breaker will restore the connection to the client 50 // after closing it due to failures. 51 RestoreTimeout time.Duration 52 // MaxFailures specifies the maximum number of failed calls to the client that will cause the circuit breaker 53 // to close the connection. 54 MaxFailures uint32 55 // MaxRequests specifies the maximum number of requests to check if connection restored after timeout. 56 MaxRequests uint32 57 } 58 59 // NewManager creates a new Manager with the specified parameters. 60 func NewManager( 61 logger zerolog.Logger, 62 metrics module.AccessMetrics, 63 cache *Cache, 64 maxMsgSize uint, 65 circuitBreakerConfig CircuitBreakerConfig, 66 compressorName string, 67 ) Manager { 68 return Manager{ 69 cache: cache, 70 logger: logger, 71 metrics: metrics, 72 maxMsgSize: maxMsgSize, 73 circuitBreakerConfig: circuitBreakerConfig, 74 compressorName: compressorName, 75 } 76 } 77 78 // GetConnection returns a gRPC client connection for the given grpcAddress and timeout. 79 // If a cache is used, it retrieves a cached connection, otherwise creates a new connection. 80 // It returns the client connection and an io.Closer to close the connection when done. 81 // The networkPubKey is the public key used for creating secure gRPC connection. Can be nil for an unsecured connection. 82 func (m *Manager) GetConnection( 83 grpcAddress string, 84 timeout time.Duration, 85 networkPubKey crypto.PublicKey, 86 ) (*grpc.ClientConn, io.Closer, error) { 87 if m.cache != nil { 88 client, err := m.cache.GetConnected(grpcAddress, timeout, networkPubKey, m.createConnection) 89 if err != nil { 90 return nil, nil, err 91 } 92 93 return client.ClientConn(), &noopCloser{}, nil 94 } 95 96 conn, err := m.createConnection(grpcAddress, timeout, networkPubKey, nil) 97 if err != nil { 98 return nil, nil, err 99 } 100 101 return conn, io.Closer(conn), nil 102 } 103 104 // createConnection creates a new gRPC connection to the remote node at the given address with the specified timeout. 105 // If the cachedClient is not nil, it means a new entry in the cache is being created, so it's locked to give priority 106 // to the caller working with the new client, allowing it to create the underlying connection. 107 // The networkPubKey is optional and configures a connection level security for gRPC connection. If it is not nil, 108 // it means that it used for creating secure gRPC connection. If it is nil, it means unsecure gRPC connection is being created. 109 func (m *Manager) createConnection( 110 address string, 111 timeout time.Duration, 112 networkPubKey crypto.PublicKey, 113 cachedClient *CachedClient, 114 ) (*grpc.ClientConn, error) { 115 if timeout == 0 { 116 timeout = DefaultClientTimeout 117 } 118 119 keepaliveParams := keepalive.ClientParameters{ 120 Time: 10 * time.Second, // How long the client will wait before sending a keepalive to the server if there is no activity. 121 Timeout: timeout, // How long the client will wait for a response from the keepalive before closing. 122 } 123 124 // The order in which interceptors are added to the `connInterceptors` slice is important since they will be called 125 // in the opposite order during gRPC requests. See documentation for more info: 126 // https://grpc.io/blog/grpc-web-interceptor/#binding-interceptors 127 var connInterceptors []grpc.UnaryClientInterceptor 128 129 if !m.circuitBreakerConfig.Enabled && cachedClient != nil { 130 connInterceptors = append(connInterceptors, m.createClientInvalidationInterceptor(cachedClient)) 131 } 132 133 connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout)) 134 135 // This interceptor monitors ongoing requests before passing control to subsequent interceptors. 136 if cachedClient != nil { 137 connInterceptors = append(connInterceptors, createRequestWatcherInterceptor(cachedClient)) 138 } 139 140 if m.circuitBreakerConfig.Enabled { 141 // If the circuit breaker interceptor is enabled, it should always be called first before passing control to 142 // subsequent interceptors. 143 connInterceptors = append(connInterceptors, m.createCircuitBreakerInterceptor()) 144 } 145 146 // ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached 147 // The connections should be safe to be persisted and reused. 148 // https://pkg.go.dev/google.golang.org/grpc#WithKeepaliveParams 149 // https://grpc.io/blog/grpc-on-http2/#keeping-connections-alive 150 var opts []grpc.DialOption 151 opts = append(opts, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(m.maxMsgSize)))) 152 opts = append(opts, grpc.WithKeepaliveParams(keepaliveParams)) 153 opts = append(opts, grpc.WithChainUnaryInterceptor(connInterceptors...)) 154 155 if m.compressorName != grpcutils.NoCompressor { 156 opts = append(opts, grpc.WithDefaultCallOptions(grpc.UseCompressor(m.compressorName))) 157 } 158 159 if networkPubKey != nil { 160 tlsConfig, err := grpcutils.DefaultClientTLSConfig(networkPubKey) 161 if err != nil { 162 return nil, fmt.Errorf("failed to get default TLS client config using public flow networking key %s %w", networkPubKey.String(), err) 163 } 164 opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))) 165 } else { 166 opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) 167 } 168 169 conn, err := grpc.Dial( 170 address, 171 opts..., 172 ) 173 if err != nil { 174 return nil, fmt.Errorf("failed to connect to address %s: %w", address, err) 175 } 176 return conn, nil 177 } 178 179 // createRequestWatcherInterceptor creates a request watcher interceptor to wait for unfinished requests before closing. 180 func createRequestWatcherInterceptor(cachedClient *CachedClient) grpc.UnaryClientInterceptor { 181 requestWatcherInterceptor := func( 182 ctx context.Context, 183 method string, 184 req interface{}, 185 reply interface{}, 186 cc *grpc.ClientConn, 187 invoker grpc.UnaryInvoker, 188 opts ...grpc.CallOption, 189 ) error { 190 // Prevent new requests from being sent if the connection is marked for closure. 191 if cachedClient.CloseRequested() { 192 return status.Errorf(codes.Unavailable, "the connection to %s was closed", cachedClient.Address()) 193 } 194 195 // Increment the request counter to track ongoing requests, then decrement the request counter before returning. 196 done := cachedClient.AddRequest() 197 defer done() 198 199 // Invoke the actual RPC method. 200 return invoker(ctx, method, req, reply, cc, opts...) 201 } 202 203 return requestWatcherInterceptor 204 } 205 206 // WithClientTimeoutOption is a helper function to create a GRPC dial option 207 // with the specified client timeout interceptor. 208 func WithClientTimeoutOption(timeout time.Duration) grpc.DialOption { 209 return grpc.WithUnaryInterceptor(createClientTimeoutInterceptor(timeout)) 210 } 211 212 // createClientTimeoutInterceptor creates a client interceptor with a context that expires after the timeout. 213 func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor { 214 clientTimeoutInterceptor := func( 215 ctx context.Context, 216 method string, 217 req interface{}, 218 reply interface{}, 219 cc *grpc.ClientConn, 220 invoker grpc.UnaryInvoker, 221 opts ...grpc.CallOption, 222 ) error { 223 // Create a context that expires after the specified timeout. 224 ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout) 225 defer cancel() 226 227 // Call the remote GRPC using the short context. 228 err := invoker(ctxWithTimeout, method, req, reply, cc, opts...) 229 230 return err 231 } 232 233 return clientTimeoutInterceptor 234 } 235 236 // createClientInvalidationInterceptor creates a client interceptor for client invalidation. It should only be created 237 // if the circuit breaker is disabled. If the response from the server indicates an unavailable status, it invalidates 238 // the corresponding client. 239 func (m *Manager) createClientInvalidationInterceptor(cachedClient *CachedClient) grpc.UnaryClientInterceptor { 240 return func( 241 ctx context.Context, 242 method string, 243 req interface{}, 244 reply interface{}, 245 cc *grpc.ClientConn, 246 invoker grpc.UnaryInvoker, 247 opts ...grpc.CallOption, 248 ) error { 249 err := invoker(ctx, method, req, reply, cc, opts...) 250 if status.Code(err) == codes.Unavailable { 251 cachedClient.Invalidate() 252 } 253 254 return err 255 } 256 } 257 258 // The simplified representation and description of circuit breaker pattern, that used to handle node connectivity: 259 // 260 // Circuit Open --> Circuit Half-Open --> Circuit Closed 261 // ^ | 262 // | | 263 // +--------------------------------------+ 264 // 265 // The "Circuit Open" state represents the circuit being open, indicating that the node is not available. 266 // This state is entered when the number of consecutive failures exceeds the maximum allowed failures. 267 // 268 // The "Circuit Half-Open" state represents the circuit transitioning from the open state to the half-open 269 // state after a configured restore timeout. In this state, the circuit allows a limited number of requests 270 // to test if the node has recovered. 271 // 272 // The "Circuit Closed" state represents the circuit being closed, indicating that the node is available. 273 // This state is initial or entered when the test requests in the half-open state succeed. 274 275 // createCircuitBreakerInterceptor creates a client interceptor for circuit breaker functionality. It should only be 276 // created if the circuit breaker is enabled. All invocations will go through the circuit breaker to be tracked for 277 // success or failure of the call. 278 func (m *Manager) createCircuitBreakerInterceptor() grpc.UnaryClientInterceptor { 279 if m.circuitBreakerConfig.Enabled { 280 circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{ 281 // Timeout defines how long the circuit breaker will remain open before transitioning to the HalfClose state. 282 Timeout: m.circuitBreakerConfig.RestoreTimeout, 283 // ReadyToTrip returns true when the circuit breaker should trip and transition to the Open state 284 ReadyToTrip: func(counts gobreaker.Counts) bool { 285 // The number of maximum failures is checked before the circuit breaker goes to the Open state. 286 return counts.ConsecutiveFailures >= m.circuitBreakerConfig.MaxFailures 287 }, 288 // MaxRequests defines the max number of concurrent requests while the circuit breaker is in the HalfClosed 289 // state. 290 MaxRequests: m.circuitBreakerConfig.MaxRequests, 291 // IsSuccessful defines gRPC status codes that should be treated as a successful result for the circuit breaker. 292 IsSuccessful: func(err error) bool { 293 if se, ok := status.FromError(err); ok { 294 if se == nil { 295 return true 296 } 297 298 // There are several error cases that may occur during normal operation and should be considered 299 // as "successful" from the perspective of the circuit breaker. 300 switch se.Code() { 301 case codes.OK, codes.Canceled, codes.InvalidArgument, codes.NotFound, codes.Unimplemented, codes.OutOfRange: 302 return true 303 default: 304 return false 305 } 306 } 307 308 return false 309 }, 310 }) 311 312 circuitBreakerInterceptor := func( 313 ctx context.Context, 314 method string, 315 req interface{}, 316 reply interface{}, 317 cc *grpc.ClientConn, 318 invoker grpc.UnaryInvoker, 319 opts ...grpc.CallOption, 320 ) error { 321 // The circuit breaker integration occurs here, where all invoked calls to the node pass through the 322 // CircuitBreaker.Execute method. This method counts successful and failed invocations, and switches to the 323 // "StateOpen" when the maximum failure threshold is reached. When the circuit breaker is in the "StateOpen" 324 // it immediately rejects connections and returns without waiting for the call timeout. After the 325 // "RestoreTimeout" period elapses, the circuit breaker transitions to the "StateHalfOpen" and attempts the 326 // invocation again. If the invocation fails, it returns to the "StateOpen"; otherwise, it transitions to 327 // the "StateClosed" and handles invocations as usual. 328 _, err := circuitBreaker.Execute(func() (interface{}, error) { 329 err := invoker(ctx, method, req, reply, cc, opts...) 330 return nil, err 331 }) 332 return err 333 } 334 335 return circuitBreakerInterceptor 336 } 337 338 return nil 339 }