github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/access/rpc/connection/manager.go (about)

     1  package connection
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"time"
     8  
     9  	"github.com/onflow/crypto"
    10  	"github.com/rs/zerolog"
    11  	"github.com/sony/gobreaker"
    12  	"google.golang.org/grpc"
    13  	"google.golang.org/grpc/codes"
    14  	"google.golang.org/grpc/credentials"
    15  	"google.golang.org/grpc/credentials/insecure"
    16  	_ "google.golang.org/grpc/encoding/gzip" //required for gRPC compression
    17  	"google.golang.org/grpc/keepalive"
    18  	"google.golang.org/grpc/status"
    19  
    20  	_ "github.com/onflow/flow-go/engine/common/grpc/compressor/deflate" //required for gRPC compression
    21  	_ "github.com/onflow/flow-go/engine/common/grpc/compressor/snappy"  //required for gRPC compression
    22  	"github.com/onflow/flow-go/module"
    23  	"github.com/onflow/flow-go/utils/grpcutils"
    24  )
    25  
// DefaultClientTimeout is used when making a GRPC request to a collection or execution node.
// createConnection falls back to this value when it is called with a zero timeout.
const DefaultClientTimeout = 3 * time.Second
    28  
    29  type noopCloser struct{}
    30  
    31  func (c *noopCloser) Close() error {
    32  	return nil
    33  }
    34  
// Manager provides methods for getting and managing gRPC client connections.
// Instances are built with NewManager.
type Manager struct {
	// logger is the logger supplied to NewManager.
	logger               zerolog.Logger
	// metrics is the access metrics collector supplied to NewManager.
	metrics              module.AccessMetrics
	// cache holds reusable client connections. When it is nil, GetConnection dials a new connection on every call.
	cache                *Cache
	// maxMsgSize is passed to grpc.MaxCallRecvMsgSize as a default call option for every created connection.
	maxMsgSize           uint
	// circuitBreakerConfig controls whether created connections get a circuit breaker interceptor and how that
	// circuit breaker is configured.
	circuitBreakerConfig CircuitBreakerConfig
	// compressorName is the gRPC compressor used by default on calls; grpcutils.NoCompressor leaves compression unset.
	compressorName       string
}
    44  
// CircuitBreakerConfig is a configuration struct for the circuit breaker that createCircuitBreakerInterceptor
// installs on client connections.
type CircuitBreakerConfig struct {
	// Enabled specifies whether the circuit breaker is enabled for collection and execution API clients.
	// When it is enabled, the client invalidation interceptor is not installed on cached connections.
	Enabled bool
	// RestoreTimeout specifies how long the circuit breaker stays in the open state (rejecting calls) before it
	// moves to the half-open state and lets trial requests through. It is used as the gobreaker Timeout setting.
	RestoreTimeout time.Duration
	// MaxFailures specifies the number of consecutive failed calls to the client at which the circuit breaker
	// trips to the open state (the check is ConsecutiveFailures >= MaxFailures).
	MaxFailures uint32
	// MaxRequests specifies the maximum number of requests let through while the circuit breaker is half-open,
	// i.e. while checking whether the node has recovered after RestoreTimeout. It is used as the gobreaker
	// MaxRequests setting.
	MaxRequests uint32
}
    58  
    59  // NewManager creates a new Manager with the specified parameters.
    60  func NewManager(
    61  	logger zerolog.Logger,
    62  	metrics module.AccessMetrics,
    63  	cache *Cache,
    64  	maxMsgSize uint,
    65  	circuitBreakerConfig CircuitBreakerConfig,
    66  	compressorName string,
    67  ) Manager {
    68  	return Manager{
    69  		cache:                cache,
    70  		logger:               logger,
    71  		metrics:              metrics,
    72  		maxMsgSize:           maxMsgSize,
    73  		circuitBreakerConfig: circuitBreakerConfig,
    74  		compressorName:       compressorName,
    75  	}
    76  }
    77  
    78  // GetConnection returns a gRPC client connection for the given grpcAddress and timeout.
    79  // If a cache is used, it retrieves a cached connection, otherwise creates a new connection.
    80  // It returns the client connection and an io.Closer to close the connection when done.
    81  // The networkPubKey is the public key used for creating secure gRPC connection. Can be nil for an unsecured connection.
    82  func (m *Manager) GetConnection(
    83  	grpcAddress string,
    84  	timeout time.Duration,
    85  	networkPubKey crypto.PublicKey,
    86  ) (*grpc.ClientConn, io.Closer, error) {
    87  	if m.cache != nil {
    88  		client, err := m.cache.GetConnected(grpcAddress, timeout, networkPubKey, m.createConnection)
    89  		if err != nil {
    90  			return nil, nil, err
    91  		}
    92  
    93  		return client.ClientConn(), &noopCloser{}, nil
    94  	}
    95  
    96  	conn, err := m.createConnection(grpcAddress, timeout, networkPubKey, nil)
    97  	if err != nil {
    98  		return nil, nil, err
    99  	}
   100  
   101  	return conn, io.Closer(conn), nil
   102  }
   103  
   104  // createConnection creates a new gRPC connection to the remote node at the given address with the specified timeout.
   105  // If the cachedClient is not nil, it means a new entry in the cache is being created, so it's locked to give priority
   106  // to the caller working with the new client, allowing it to create the underlying connection.
   107  // The networkPubKey is optional and configures a connection level security for gRPC connection. If it is not nil,
   108  // it means that it used for creating secure gRPC connection. If it is nil, it means unsecure gRPC connection is being created.
   109  func (m *Manager) createConnection(
   110  	address string,
   111  	timeout time.Duration,
   112  	networkPubKey crypto.PublicKey,
   113  	cachedClient *CachedClient,
   114  ) (*grpc.ClientConn, error) {
   115  	if timeout == 0 {
   116  		timeout = DefaultClientTimeout
   117  	}
   118  
   119  	keepaliveParams := keepalive.ClientParameters{
   120  		Time:    10 * time.Second, // How long the client will wait before sending a keepalive to the server if there is no activity.
   121  		Timeout: timeout,          // How long the client will wait for a response from the keepalive before closing.
   122  	}
   123  
   124  	// The order in which interceptors are added to the `connInterceptors` slice is important since they will be called
   125  	// in the opposite order during gRPC requests. See documentation for more info:
   126  	// https://grpc.io/blog/grpc-web-interceptor/#binding-interceptors
   127  	var connInterceptors []grpc.UnaryClientInterceptor
   128  
   129  	if !m.circuitBreakerConfig.Enabled && cachedClient != nil {
   130  		connInterceptors = append(connInterceptors, m.createClientInvalidationInterceptor(cachedClient))
   131  	}
   132  
   133  	connInterceptors = append(connInterceptors, createClientTimeoutInterceptor(timeout))
   134  
   135  	// This interceptor monitors ongoing requests before passing control to subsequent interceptors.
   136  	if cachedClient != nil {
   137  		connInterceptors = append(connInterceptors, createRequestWatcherInterceptor(cachedClient))
   138  	}
   139  
   140  	if m.circuitBreakerConfig.Enabled {
   141  		// If the circuit breaker interceptor is enabled, it should always be called first before passing control to
   142  		// subsequent interceptors.
   143  		connInterceptors = append(connInterceptors, m.createCircuitBreakerInterceptor())
   144  	}
   145  
   146  	// ClientConn's default KeepAlive on connections is indefinite, assuming the timeout isn't reached
   147  	// The connections should be safe to be persisted and reused.
   148  	// https://pkg.go.dev/google.golang.org/grpc#WithKeepaliveParams
   149  	// https://grpc.io/blog/grpc-on-http2/#keeping-connections-alive
   150  	var opts []grpc.DialOption
   151  	opts = append(opts, grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(int(m.maxMsgSize))))
   152  	opts = append(opts, grpc.WithKeepaliveParams(keepaliveParams))
   153  	opts = append(opts, grpc.WithChainUnaryInterceptor(connInterceptors...))
   154  
   155  	if m.compressorName != grpcutils.NoCompressor {
   156  		opts = append(opts, grpc.WithDefaultCallOptions(grpc.UseCompressor(m.compressorName)))
   157  	}
   158  
   159  	if networkPubKey != nil {
   160  		tlsConfig, err := grpcutils.DefaultClientTLSConfig(networkPubKey)
   161  		if err != nil {
   162  			return nil, fmt.Errorf("failed to get default TLS client config using public flow networking key %s %w", networkPubKey.String(), err)
   163  		}
   164  		opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)))
   165  	} else {
   166  		opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials()))
   167  	}
   168  
   169  	conn, err := grpc.Dial(
   170  		address,
   171  		opts...,
   172  	)
   173  	if err != nil {
   174  		return nil, fmt.Errorf("failed to connect to address %s: %w", address, err)
   175  	}
   176  	return conn, nil
   177  }
   178  
   179  // createRequestWatcherInterceptor creates a request watcher interceptor to wait for unfinished requests before closing.
   180  func createRequestWatcherInterceptor(cachedClient *CachedClient) grpc.UnaryClientInterceptor {
   181  	requestWatcherInterceptor := func(
   182  		ctx context.Context,
   183  		method string,
   184  		req interface{},
   185  		reply interface{},
   186  		cc *grpc.ClientConn,
   187  		invoker grpc.UnaryInvoker,
   188  		opts ...grpc.CallOption,
   189  	) error {
   190  		// Prevent new requests from being sent if the connection is marked for closure.
   191  		if cachedClient.CloseRequested() {
   192  			return status.Errorf(codes.Unavailable, "the connection to %s was closed", cachedClient.Address())
   193  		}
   194  
   195  		// Increment the request counter to track ongoing requests, then decrement the request counter before returning.
   196  		done := cachedClient.AddRequest()
   197  		defer done()
   198  
   199  		// Invoke the actual RPC method.
   200  		return invoker(ctx, method, req, reply, cc, opts...)
   201  	}
   202  
   203  	return requestWatcherInterceptor
   204  }
   205  
   206  // WithClientTimeoutOption is a helper function to create a GRPC dial option
   207  // with the specified client timeout interceptor.
   208  func WithClientTimeoutOption(timeout time.Duration) grpc.DialOption {
   209  	return grpc.WithUnaryInterceptor(createClientTimeoutInterceptor(timeout))
   210  }
   211  
   212  // createClientTimeoutInterceptor creates a client interceptor with a context that expires after the timeout.
   213  func createClientTimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor {
   214  	clientTimeoutInterceptor := func(
   215  		ctx context.Context,
   216  		method string,
   217  		req interface{},
   218  		reply interface{},
   219  		cc *grpc.ClientConn,
   220  		invoker grpc.UnaryInvoker,
   221  		opts ...grpc.CallOption,
   222  	) error {
   223  		// Create a context that expires after the specified timeout.
   224  		ctxWithTimeout, cancel := context.WithTimeout(ctx, timeout)
   225  		defer cancel()
   226  
   227  		// Call the remote GRPC using the short context.
   228  		err := invoker(ctxWithTimeout, method, req, reply, cc, opts...)
   229  
   230  		return err
   231  	}
   232  
   233  	return clientTimeoutInterceptor
   234  }
   235  
   236  // createClientInvalidationInterceptor creates a client interceptor for client invalidation. It should only be created
   237  // if the circuit breaker is disabled. If the response from the server indicates an unavailable status, it invalidates
   238  // the corresponding client.
   239  func (m *Manager) createClientInvalidationInterceptor(cachedClient *CachedClient) grpc.UnaryClientInterceptor {
   240  	return func(
   241  		ctx context.Context,
   242  		method string,
   243  		req interface{},
   244  		reply interface{},
   245  		cc *grpc.ClientConn,
   246  		invoker grpc.UnaryInvoker,
   247  		opts ...grpc.CallOption,
   248  	) error {
   249  		err := invoker(ctx, method, req, reply, cc, opts...)
   250  		if status.Code(err) == codes.Unavailable {
   251  			cachedClient.Invalidate()
   252  		}
   253  
   254  		return err
   255  	}
   256  }
   257  
// A simplified representation and description of the circuit breaker pattern used to handle node connectivity:
//
// Circuit Open --> Circuit Half-Open --> Circuit Closed
//      ^                                      |
//      |                                      |
//      +--------------------------------------+
//
// The "Circuit Open" state represents the circuit being open, indicating that the node is not available.
// This state is entered when the number of consecutive failures reaches the maximum allowed failures.
//
// The "Circuit Half-Open" state represents the circuit transitioning from the open state to the half-open
// state after a configured restore timeout. In this state, the circuit allows a limited number of requests
// to test if the node has recovered.
//
// The "Circuit Closed" state represents the circuit being closed, indicating that the node is available.
// This is the initial state, and it is also entered when the test requests in the half-open state succeed.
   274  
   275  // createCircuitBreakerInterceptor creates a client interceptor for circuit breaker functionality. It should only be
   276  // created if the circuit breaker is enabled. All invocations will go through the circuit breaker to be tracked for
   277  // success or failure of the call.
   278  func (m *Manager) createCircuitBreakerInterceptor() grpc.UnaryClientInterceptor {
   279  	if m.circuitBreakerConfig.Enabled {
   280  		circuitBreaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{
   281  			// Timeout defines how long the circuit breaker will remain open before transitioning to the HalfClose state.
   282  			Timeout: m.circuitBreakerConfig.RestoreTimeout,
   283  			// ReadyToTrip returns true when the circuit breaker should trip and transition to the Open state
   284  			ReadyToTrip: func(counts gobreaker.Counts) bool {
   285  				// The number of maximum failures is checked before the circuit breaker goes to the Open state.
   286  				return counts.ConsecutiveFailures >= m.circuitBreakerConfig.MaxFailures
   287  			},
   288  			// MaxRequests defines the max number of concurrent requests while the circuit breaker is in the HalfClosed
   289  			// state.
   290  			MaxRequests: m.circuitBreakerConfig.MaxRequests,
   291  			// IsSuccessful defines gRPC status codes that should be treated as a successful result for the circuit breaker.
   292  			IsSuccessful: func(err error) bool {
   293  				if se, ok := status.FromError(err); ok {
   294  					if se == nil {
   295  						return true
   296  					}
   297  
   298  					// There are several error cases that may occur during normal operation and should be considered
   299  					// as "successful" from the perspective of the circuit breaker.
   300  					switch se.Code() {
   301  					case codes.OK, codes.Canceled, codes.InvalidArgument, codes.NotFound, codes.Unimplemented, codes.OutOfRange:
   302  						return true
   303  					default:
   304  						return false
   305  					}
   306  				}
   307  
   308  				return false
   309  			},
   310  		})
   311  
   312  		circuitBreakerInterceptor := func(
   313  			ctx context.Context,
   314  			method string,
   315  			req interface{},
   316  			reply interface{},
   317  			cc *grpc.ClientConn,
   318  			invoker grpc.UnaryInvoker,
   319  			opts ...grpc.CallOption,
   320  		) error {
   321  			// The circuit breaker integration occurs here, where all invoked calls to the node pass through the
   322  			// CircuitBreaker.Execute method. This method counts successful and failed invocations, and switches to the
   323  			// "StateOpen" when the maximum failure threshold is reached. When the circuit breaker is in the "StateOpen"
   324  			// it immediately rejects connections and returns without waiting for the call timeout. After the
   325  			// "RestoreTimeout" period elapses, the circuit breaker transitions to the "StateHalfOpen" and attempts the
   326  			// invocation again. If the invocation fails, it returns to the "StateOpen"; otherwise, it transitions to
   327  			// the "StateClosed" and handles invocations as usual.
   328  			_, err := circuitBreaker.Execute(func() (interface{}, error) {
   329  				err := invoker(ctx, method, req, reply, cc, opts...)
   330  				return nil, err
   331  			})
   332  			return err
   333  		}
   334  
   335  		return circuitBreakerInterceptor
   336  	}
   337  
   338  	return nil
   339  }