github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/client/client.go (about)

     1  package segmentwriterclient
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"strconv"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/go-kit/log"
    12  	"github.com/go-kit/log/level"
    13  	"github.com/grafana/dskit/grpcclient"
    14  	"github.com/grafana/dskit/ring"
    15  	ring_client "github.com/grafana/dskit/ring/client"
    16  	"github.com/grafana/dskit/services"
    17  	"github.com/pkg/errors"
    18  	"github.com/prometheus/client_golang/prometheus"
    19  	"github.com/sony/gobreaker/v2"
    20  	"google.golang.org/grpc"
    21  	"google.golang.org/grpc/codes"
    22  	"google.golang.org/grpc/status"
    23  
    24  	segmentwriterv1 "github.com/grafana/pyroscope/api/gen/proto/go/segmentwriter/v1"
    25  	"github.com/grafana/pyroscope/pkg/segmentwriter/client/connpool"
    26  	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor"
    27  	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement"
    28  	"github.com/grafana/pyroscope/pkg/util/circuitbreaker"
    29  )
    30  
// errServiceUnavailableMsg is the generic message returned to callers
// (with an UNAVAILABLE status code) in place of internal error details.
var errServiceUnavailableMsg = "service is unavailable"
    32  
    33  // TODO(kolesnikovae):
    34  //  * Replace the ring service discovery and client pool implementations.
    35  //  * Make CB options configurable.
    36  
const (
	// Circuit breaker defaults.

	// cbMinSuccess is the number of requests allowed through in the
	// half-open state (gobreaker MaxRequests); that many consecutive
	// successes close the circuit again.
	cbMinSuccess     = 5
	// cbMaxFailures is the number of consecutive failures that trips
	// the breaker into the open state (see circuitBreakerConfig).
	cbMaxFailures    = 3
	// cbClosedInterval is the cyclic period for clearing counters in the
	// closed state; 0 means the counters are never cleared.
	cbClosedInterval = 0
	// cbOpenTimeout is how long the breaker stays open before
	// transitioning to the half-open state.
	cbOpenTimeout    = time.Second

	// poolCleanupPeriod is how often stale clients are removed from the
	// connection pool (ring_client.PoolConfig.CheckInterval).
	poolCleanupPeriod = 15 * time.Second
)
    46  
    47  // Only these errors are considered as a signal to retry the request
    48  // and send it to another instance. Client-side, internal, and unknown
    49  // errors should not be retried, as they are likely to be permanent.
    50  // Note that the client errors are not excluded from the list.
    51  func isRetryable(err error) bool {
    52  	switch status.Code(err) {
    53  	case codes.Unknown,
    54  		codes.Internal,
    55  		codes.FailedPrecondition:
    56  		return false
    57  	default:
    58  		// All sorts of network errors.
    59  		return true
    60  	}
    61  }
    62  
    63  // Client errors are returned as is without retries.
    64  // Any other error is substituted with a stub message
    65  // and UNAVAILABLE status.
    66  func isClientError(err error) bool {
    67  	switch status.Code(err) {
    68  	case codes.InvalidArgument,
    69  		codes.Canceled,
    70  		codes.PermissionDenied,
    71  		codes.Unauthenticated:
    72  		return true
    73  	default:
    74  		return errors.Is(err, context.Canceled)
    75  	}
    76  }
    77  
// https://en.wikipedia.org/wiki/Circuit_breaker_design_pattern
// The circuit breaker is used to prevent the client from sending
// requests to unhealthy instances. The logic is as follows:
//
// Once we observe cbMaxFailures (3) consecutive failures, the circuit
// breaker will trip and open the circuit – any attempt to send a request
// will fail immediately with a "circuit breaker is open" error
// (UNAVAILABLE).
//
// After the expiration of the Timeout (cbOpenTimeout, 1 second), the
// circuit breaker will transition to the half-open state. In this state,
// if a failure occurs, the breaker will revert to the open state. After
// MaxRequests (cbMinSuccess, 5) consecutive successful requests, the
// circuit breaker will return to the closed state.
var circuitBreakerConfig = gobreaker.Settings{
	MaxRequests:  cbMinSuccess,
	Interval:     cbClosedInterval,
	Timeout:      cbOpenTimeout,
	IsSuccessful: shouldBeHandledByCaller,
	ReadyToTrip: func(counts gobreaker.Counts) bool {
		// Trip only on consecutive failures, not on the total count.
		return counts.ConsecutiveFailures >= cbMaxFailures
	},
}
   100  
   101  // If the function returns false, the error is counted towards tripping
   102  // the open state, when no requests flow through the circuit. Otherwise,
   103  // the error handling is returned back the caller.
   104  //
   105  // In fact, the configuration should only prevent sending requests
   106  // to instances that are a-priory unable to process them at the moment,
   107  // and we want to avoid time waste. For example, when a service instance
   108  // went unavailable for a long period of time, or is not reposing in
   109  // timely fashion.
   110  //
   111  // From the caller perspective, we're converting those to UNAVAILABLE,
   112  // thereby allowing the caller to retry the request against another service
   113  // instance.
   114  //
   115  // Note that client-side, internal, and unknown errors are not included:
   116  // in case if a request is failing permanently regardless of the service
   117  // instance, there is a good chance that all the circuits will be opened
   118  // by retries, making the whole service unavailable.
   119  //
   120  // Next, ResourceExhausted also excluded from the list: as the error is
   121  // tenant-request-specific, and the circuit breaker operates connection-wise.
   122  func shouldBeHandledByCaller(err error) bool {
   123  	if errors.Is(err, os.ErrDeadlineExceeded) {
   124  		return false
   125  	}
   126  	if status.Code(err) == codes.Unavailable {
   127  		return false
   128  	}
   129  	// The error handling is returned back the caller: the circuit
   130  	// remains closed.
   131  	return true
   132  }
   133  
// grpcServiceConfig is the default gRPC service config applied to every
// client connection. It enables the standard gRPC client-side health
// checking protocol for the "pyroscope.segment-writer" service, so that
// unhealthy connections are taken out of rotation by the gRPC machinery.
const grpcServiceConfig = `{
    "healthCheckConfig": {
         "serviceName": "pyroscope.segment-writer"
    }
}`
   141  
// Client is the segment writer client: it selects target instances via
// the distributor and the ring, and pushes profiles over pooled gRPC
// connections with retries and per-connection circuit breaking.
type Client struct {
	logger  log.Logger
	metrics *metrics

	ring        ring.ReadRing            // instance discovery
	pool        *connpool.Pool           // per-instance gRPC connections
	distributor *distributor.Distributor // shard and instance placement

	// Lifecycle: service wraps starting/running/stopping; subservices
	// manages the pool, and watcher surfaces subservice failures.
	service     services.Service
	subservices *services.Manager
	watcher     *services.FailureWatcher
}
   154  
   155  func NewSegmentWriterClient(
   156  	grpcClientConfig grpcclient.Config,
   157  	logger log.Logger,
   158  	registry prometheus.Registerer,
   159  	ring ring.ReadRing,
   160  	placement placement.Placement,
   161  	dialOpts ...grpc.DialOption,
   162  ) (*Client, error) {
   163  	pool, err := newConnPool(ring, logger, grpcClientConfig, dialOpts...)
   164  	if err != nil {
   165  		return nil, err
   166  	}
   167  	c := &Client{
   168  		logger:      logger,
   169  		metrics:     newMetrics(registry),
   170  		distributor: distributor.NewDistributor(placement, ring),
   171  		pool:        pool,
   172  		ring:        ring,
   173  	}
   174  	c.subservices, err = services.NewManager(c.pool)
   175  	if err != nil {
   176  		return nil, fmt.Errorf("services manager: %w", err)
   177  	}
   178  	c.watcher = services.NewFailureWatcher()
   179  	c.watcher.WatchManager(c.subservices)
   180  	c.service = services.NewBasicService(c.starting, c.running, c.stopping)
   181  	return c, nil
   182  }
   183  
// Service returns the client's lifecycle service (start/run/stop).
func (c *Client) Service() services.Service { return c.service }
   185  
   186  func (c *Client) starting(ctx context.Context) error {
   187  	// Warm up connections. The pool does not do this.
   188  	instances, err := c.ring.GetAllHealthy(ring.Reporting)
   189  	if err != nil {
   190  		// The ring might be empty initially if the segment-writer service
   191  		// is not yet ready. In such cases, we avoid failing the client to
   192  		// allow for eventual readiness.
   193  		level.Debug(c.logger).Log("msg", "unable to create connections", "err", err)
   194  	} else {
   195  		var wg sync.WaitGroup
   196  		for _, x := range instances.Instances {
   197  			wg.Add(1)
   198  			go func(x ring.InstanceDesc) {
   199  				defer wg.Done()
   200  				_, _ = c.pool.GetClientFor(x.Addr)
   201  			}(x)
   202  		}
   203  		wg.Wait()
   204  	}
   205  	return services.StartManagerAndAwaitHealthy(ctx, c.subservices)
   206  }
   207  
   208  func (c *Client) running(ctx context.Context) error {
   209  	select {
   210  	case <-ctx.Done():
   211  		return nil
   212  	case err := <-c.watcher.Chan():
   213  		return fmt.Errorf("segement writer client subservice failed: %w", err)
   214  	}
   215  }
   216  
// stopping stops all subservices and waits for them to terminate. A fresh
// background context is used so shutdown is not bound by the (by now
// likely canceled) service context.
func (c *Client) stopping(_ error) error {
	return services.StopManagerAndAwaitStopped(context.Background(), c.subservices)
}
   220  
// Push routes the profile in req to a segment writer instance chosen by
// the distributor and retries against other active shard instances on
// retryable failures.
//
// Error mapping:
//   - client errors (see isClientError) are returned to the caller as is;
//   - non-retryable errors and exhausted retries are replaced with an
//     UNAVAILABLE status and a generic message, hiding internal details;
//   - context cancellation aborts the retry loop immediately.
func (c *Client) Push(
	ctx context.Context,
	req *segmentwriterv1.PushRequest,
) (resp *segmentwriterv1.PushResponse, err error) {
	// The placement key is derived from the tenant and the request labels.
	k := distributor.NewTenantServiceDatasetKey(req.TenantId, req.Labels...)
	p, dErr := c.distributor.Distribute(k)
	if dErr != nil {
		level.Error(c.logger).Log(
			"msg", "unable to distribute request",
			"tenant", req.TenantId,
			"err", dErr,
		)
		return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg)
	}

	// In case of a failure, the request is sent to another instance.
	// At most 5 attempts to push the data to the segment writer.
	// NOTE(review): the condition `attempts >= 0` actually permits up to
	// 6 iterations (attempts 5..0), one more than the comment above
	// suggests — confirm the intended count.
	instances := placement.ActiveInstances(p.Instances)
	req.Shard = p.Shard
	for attempts := 5; attempts >= 0 && instances.Next(); attempts-- {
		instance := instances.At()
		logger := log.With(c.logger,
			"tenant", req.TenantId,
			"shard", req.Shard,
			"instance_addr", instance.Addr,
			"instance_id", instance.Id,
			"attempts_left", attempts,
		)
		level.Debug(logger).Log("msg", "sending request")
		resp, err = c.pushToInstance(ctx, req, instance.Addr)
		if err == nil {
			return resp, nil
		}
		// Caller-caused errors are returned verbatim, without retries.
		if isClientError(err) {
			return nil, err
		}
		// Permanent server-side errors are masked with UNAVAILABLE so that
		// internal details do not leak to the caller.
		if !isRetryable(err) {
			level.Error(logger).Log("msg", "failed to push data to segment writer", "err", err)
			return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg)
		}
		level.Warn(logger).Log("msg", "failed attempt to push data to segment writer", "err", err)
		// Stop retrying once the caller's context is done.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, ctxErr
		}
	}

	// Either the attempt budget or the instance list was exhausted.
	level.Error(c.logger).Log(
		"msg", "no segment writer instances available for the request",
		"tenant", req.TenantId,
		"shard", req.Shard,
		"last_err", err,
	)

	return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg)
}
   276  
   277  func (c *Client) pushToInstance(
   278  	ctx context.Context,
   279  	req *segmentwriterv1.PushRequest,
   280  	addr string,
   281  ) (*segmentwriterv1.PushResponse, error) {
   282  	conn, err := c.pool.GetConnFor(addr)
   283  	if err != nil {
   284  		return nil, err
   285  	}
   286  	// We explicitly force the client to not wait for the connection:
   287  	// if the connection is not ready, the client will go to the next
   288  	// instance.
   289  	client := segmentwriterv1.NewSegmentWriterServiceClient(conn)
   290  	resp, err := client.Push(ctx, req, grpc.WaitForReady(false))
   291  	if err == nil {
   292  		c.metrics.sentBytes.
   293  			WithLabelValues(strconv.Itoa(int(req.Shard)), req.TenantId, addr).
   294  			Observe(float64(len(req.Profile)))
   295  	}
   296  	return resp, err
   297  }
   298  
   299  func newConnPool(
   300  	rring ring.ReadRing,
   301  	logger log.Logger,
   302  	grpcClientConfig grpcclient.Config,
   303  	dialOpts ...grpc.DialOption,
   304  ) (*connpool.Pool, error) {
   305  	options, err := grpcClientConfig.DialOption(nil, nil, nil)
   306  	if err != nil {
   307  		return nil, err
   308  	}
   309  
   310  	// The options (including interceptors) are shared by all client connections.
   311  	options = append(options, dialOpts...)
   312  	options = append(options,
   313  		grpc.WithDefaultServiceConfig(grpcServiceConfig),
   314  		// Just in case: we explicitly disable the built-in
   315  		// retry mechanism of the gRPC client.
   316  		grpc.WithDisableRetry(),
   317  	)
   318  
   319  	// Note that circuit breaker must be created per client conn.
   320  	factory := connpool.NewConnPoolFactory(func(ring.InstanceDesc) []grpc.DialOption {
   321  		cb := circuitbreaker.UnaryClientInterceptor(gobreaker.NewCircuitBreaker[any](circuitBreakerConfig))
   322  		return append(options, grpc.WithUnaryInterceptor(cb))
   323  	})
   324  
   325  	p := ring_client.NewPool(
   326  		"segment-writer",
   327  		ring_client.PoolConfig{
   328  			CheckInterval: poolCleanupPeriod,
   329  			// Note that health checks are not used: gGRPC health-checking
   330  			// is done at the gRPC connection level.
   331  			HealthCheckEnabled:        false,
   332  			HealthCheckTimeout:        0,
   333  			MaxConcurrentHealthChecks: 0,
   334  		},
   335  		// Discovery is used to remove clients that can't be found
   336  		// in the ring, including unhealthy instances. CheckInterval
   337  		// specifies how frequently the stale clients are removed.
   338  		// Discovery builds a list of healthy instances.
   339  		// An instance is healthy, if it's heartbeat timestamp
   340  		// is not older than a configured threshold (intrinsic
   341  		// to the ring itself).
   342  		ring_client.NewRingServiceDiscovery(rring),
   343  		factory,
   344  		nil, // Client count gauge is not used.
   345  		logger,
   346  	)
   347  
   348  	return &connpool.Pool{Pool: p}, nil
   349  }