github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/client/client.go (about) 1 package segmentwriterclient 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "strconv" 8 "sync" 9 "time" 10 11 "github.com/go-kit/log" 12 "github.com/go-kit/log/level" 13 "github.com/grafana/dskit/grpcclient" 14 "github.com/grafana/dskit/ring" 15 ring_client "github.com/grafana/dskit/ring/client" 16 "github.com/grafana/dskit/services" 17 "github.com/pkg/errors" 18 "github.com/prometheus/client_golang/prometheus" 19 "github.com/sony/gobreaker/v2" 20 "google.golang.org/grpc" 21 "google.golang.org/grpc/codes" 22 "google.golang.org/grpc/status" 23 24 segmentwriterv1 "github.com/grafana/pyroscope/api/gen/proto/go/segmentwriter/v1" 25 "github.com/grafana/pyroscope/pkg/segmentwriter/client/connpool" 26 "github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor" 27 "github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement" 28 "github.com/grafana/pyroscope/pkg/util/circuitbreaker" 29 ) 30 31 var errServiceUnavailableMsg = "service is unavailable" 32 33 // TODO(kolesnikovae): 34 // * Replace the ring service discovery and client pool implementations. 35 // * Make CB options configurable. 36 37 const ( 38 // Circuit breaker defaults. 39 cbMinSuccess = 5 40 cbMaxFailures = 3 41 cbClosedInterval = 0 42 cbOpenTimeout = time.Second 43 44 poolCleanupPeriod = 15 * time.Second 45 ) 46 47 // Only these errors are considered as a signal to retry the request 48 // and send it to another instance. Client-side, internal, and unknown 49 // errors should not be retried, as they are likely to be permanent. 50 // Note that the client errors are not excluded from the list. 51 func isRetryable(err error) bool { 52 switch status.Code(err) { 53 case codes.Unknown, 54 codes.Internal, 55 codes.FailedPrecondition: 56 return false 57 default: 58 // All sorts of network errors. 59 return true 60 } 61 } 62 63 // Client errors are returned as is without retries. 64 // Any other error is substituted with a stub message 65 // and UNAVAILABLE status. 66 func isClientError(err error) bool { 67 switch status.Code(err) { 68 case codes.InvalidArgument, 69 codes.Canceled, 70 codes.PermissionDenied, 71 codes.Unauthenticated: 72 return true 73 default: 74 return errors.Is(err, context.Canceled) 75 } 76 } 77 78 // https://en.wikipedia.org/wiki/Circuit_breaker_design_pattern 79 // The circuit breaker is used to prevent the client from sending 80 // requests to unhealthy instances. The logic is as follows: 81 // 82 // Once we observe 3 consecutive failures, the circuit breaker will trip 83 // and open the circuit – any attempt to send a request will fail 84 // immediately with a "circuit breaker is open" error (UNAVAILABLE). 85 // 86 // After the expiration of the Timeout (5 seconds), the circuit breaker will 87 // transition to the half-open state. In this state, if a failure occurs, 88 // the breaker will revert to the open state. After MaxRequests (5) 89 // consecutive successful requests, the circuit breaker will return to the 90 // closed state. 91 var circuitBreakerConfig = gobreaker.Settings{ 92 MaxRequests: cbMinSuccess, 93 Interval: cbClosedInterval, 94 Timeout: cbOpenTimeout, 95 IsSuccessful: shouldBeHandledByCaller, 96 ReadyToTrip: func(counts gobreaker.Counts) bool { 97 return counts.ConsecutiveFailures >= cbMaxFailures 98 }, 99 } 100 101 // If the function returns false, the error is counted towards tripping 102 // the open state, when no requests flow through the circuit. Otherwise, 103 // the error handling is returned back the caller. 104 // 105 // In fact, the configuration should only prevent sending requests 106 // to instances that are a-priory unable to process them at the moment, 107 // and we want to avoid time waste. For example, when a service instance 108 // went unavailable for a long period of time, or is not reposing in 109 // timely fashion. 110 // 111 // From the caller perspective, we're converting those to UNAVAILABLE, 112 // thereby allowing the caller to retry the request against another service 113 // instance. 114 // 115 // Note that client-side, internal, and unknown errors are not included: 116 // in case if a request is failing permanently regardless of the service 117 // instance, there is a good chance that all the circuits will be opened 118 // by retries, making the whole service unavailable. 119 // 120 // Next, ResourceExhausted also excluded from the list: as the error is 121 // tenant-request-specific, and the circuit breaker operates connection-wise. 122 func shouldBeHandledByCaller(err error) bool { 123 if errors.Is(err, os.ErrDeadlineExceeded) { 124 return false 125 } 126 if status.Code(err) == codes.Unavailable { 127 return false 128 } 129 // The error handling is returned back the caller: the circuit 130 // remains closed. 131 return true 132 } 133 134 // The default gRPC service config is explicitly set to balance between 135 // instances. 136 const grpcServiceConfig = `{ 137 "healthCheckConfig": { 138 "serviceName": "pyroscope.segment-writer" 139 } 140 }` 141 142 type Client struct { 143 logger log.Logger 144 metrics *metrics 145 146 ring ring.ReadRing 147 pool *connpool.Pool 148 distributor *distributor.Distributor 149 150 service services.Service 151 subservices *services.Manager 152 watcher *services.FailureWatcher 153 } 154 155 func NewSegmentWriterClient( 156 grpcClientConfig grpcclient.Config, 157 logger log.Logger, 158 registry prometheus.Registerer, 159 ring ring.ReadRing, 160 placement placement.Placement, 161 dialOpts ...grpc.DialOption, 162 ) (*Client, error) { 163 pool, err := newConnPool(ring, logger, grpcClientConfig, dialOpts...) 164 if err != nil { 165 return nil, err 166 } 167 c := &Client{ 168 logger: logger, 169 metrics: newMetrics(registry), 170 distributor: distributor.NewDistributor(placement, ring), 171 pool: pool, 172 ring: ring, 173 } 174 c.subservices, err = services.NewManager(c.pool) 175 if err != nil { 176 return nil, fmt.Errorf("services manager: %w", err) 177 } 178 c.watcher = services.NewFailureWatcher() 179 c.watcher.WatchManager(c.subservices) 180 c.service = services.NewBasicService(c.starting, c.running, c.stopping) 181 return c, nil 182 } 183 184 func (c *Client) Service() services.Service { return c.service } 185 186 func (c *Client) starting(ctx context.Context) error { 187 // Warm up connections. The pool does not do this. 188 instances, err := c.ring.GetAllHealthy(ring.Reporting) 189 if err != nil { 190 // The ring might be empty initially if the segment-writer service 191 // is not yet ready. In such cases, we avoid failing the client to 192 // allow for eventual readiness. 193 level.Debug(c.logger).Log("msg", "unable to create connections", "err", err) 194 } else { 195 var wg sync.WaitGroup 196 for _, x := range instances.Instances { 197 wg.Add(1) 198 go func(x ring.InstanceDesc) { 199 defer wg.Done() 200 _, _ = c.pool.GetClientFor(x.Addr) 201 }(x) 202 } 203 wg.Wait() 204 } 205 return services.StartManagerAndAwaitHealthy(ctx, c.subservices) 206 } 207 208 func (c *Client) running(ctx context.Context) error { 209 select { 210 case <-ctx.Done(): 211 return nil 212 case err := <-c.watcher.Chan(): 213 return fmt.Errorf("segement writer client subservice failed: %w", err) 214 } 215 } 216 217 func (c *Client) stopping(_ error) error { 218 return services.StopManagerAndAwaitStopped(context.Background(), c.subservices) 219 } 220 221 func (c *Client) Push( 222 ctx context.Context, 223 req *segmentwriterv1.PushRequest, 224 ) (resp *segmentwriterv1.PushResponse, err error) { 225 k := distributor.NewTenantServiceDatasetKey(req.TenantId, req.Labels...) 226 p, dErr := c.distributor.Distribute(k) 227 if dErr != nil { 228 level.Error(c.logger).Log( 229 "msg", "unable to distribute request", 230 "tenant", req.TenantId, 231 "err", dErr, 232 ) 233 return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg) 234 } 235 236 // In case of a failure, the request is sent to another instance. 237 // At most 5 attempts to push the data to the segment writer. 238 instances := placement.ActiveInstances(p.Instances) 239 req.Shard = p.Shard 240 for attempts := 5; attempts >= 0 && instances.Next(); attempts-- { 241 instance := instances.At() 242 logger := log.With(c.logger, 243 "tenant", req.TenantId, 244 "shard", req.Shard, 245 "instance_addr", instance.Addr, 246 "instance_id", instance.Id, 247 "attempts_left", attempts, 248 ) 249 level.Debug(logger).Log("msg", "sending request") 250 resp, err = c.pushToInstance(ctx, req, instance.Addr) 251 if err == nil { 252 return resp, nil 253 } 254 if isClientError(err) { 255 return nil, err 256 } 257 if !isRetryable(err) { 258 level.Error(logger).Log("msg", "failed to push data to segment writer", "err", err) 259 return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg) 260 } 261 level.Warn(logger).Log("msg", "failed attempt to push data to segment writer", "err", err) 262 if ctxErr := ctx.Err(); ctxErr != nil { 263 return nil, ctxErr 264 } 265 } 266 267 level.Error(c.logger).Log( 268 "msg", "no segment writer instances available for the request", 269 "tenant", req.TenantId, 270 "shard", req.Shard, 271 "last_err", err, 272 ) 273 274 return nil, status.Error(codes.Unavailable, errServiceUnavailableMsg) 275 } 276 277 func (c *Client) pushToInstance( 278 ctx context.Context, 279 req *segmentwriterv1.PushRequest, 280 addr string, 281 ) (*segmentwriterv1.PushResponse, error) { 282 conn, err := c.pool.GetConnFor(addr) 283 if err != nil { 284 return nil, err 285 } 286 // We explicitly force the client to not wait for the connection: 287 // if the connection is not ready, the client will go to the next 288 // instance. 289 client := segmentwriterv1.NewSegmentWriterServiceClient(conn) 290 resp, err := client.Push(ctx, req, grpc.WaitForReady(false)) 291 if err == nil { 292 c.metrics.sentBytes. 293 WithLabelValues(strconv.Itoa(int(req.Shard)), req.TenantId, addr). 294 Observe(float64(len(req.Profile))) 295 } 296 return resp, err 297 } 298 299 func newConnPool( 300 rring ring.ReadRing, 301 logger log.Logger, 302 grpcClientConfig grpcclient.Config, 303 dialOpts ...grpc.DialOption, 304 ) (*connpool.Pool, error) { 305 options, err := grpcClientConfig.DialOption(nil, nil, nil) 306 if err != nil { 307 return nil, err 308 } 309 310 // The options (including interceptors) are shared by all client connections. 311 options = append(options, dialOpts...) 312 options = append(options, 313 grpc.WithDefaultServiceConfig(grpcServiceConfig), 314 // Just in case: we explicitly disable the built-in 315 // retry mechanism of the gRPC client. 316 grpc.WithDisableRetry(), 317 ) 318 319 // Note that circuit breaker must be created per client conn. 320 factory := connpool.NewConnPoolFactory(func(ring.InstanceDesc) []grpc.DialOption { 321 cb := circuitbreaker.UnaryClientInterceptor(gobreaker.NewCircuitBreaker[any](circuitBreakerConfig)) 322 return append(options, grpc.WithUnaryInterceptor(cb)) 323 }) 324 325 p := ring_client.NewPool( 326 "segment-writer", 327 ring_client.PoolConfig{ 328 CheckInterval: poolCleanupPeriod, 329 // Note that health checks are not used: gGRPC health-checking 330 // is done at the gRPC connection level. 331 HealthCheckEnabled: false, 332 HealthCheckTimeout: 0, 333 MaxConcurrentHealthChecks: 0, 334 }, 335 // Discovery is used to remove clients that can't be found 336 // in the ring, including unhealthy instances. CheckInterval 337 // specifies how frequently the stale clients are removed. 338 // Discovery builds a list of healthy instances. 339 // An instance is healthy, if it's heartbeat timestamp 340 // is not older than a configured threshold (intrinsic 341 // to the ring itself). 342 ring_client.NewRingServiceDiscovery(rring), 343 factory, 344 nil, // Client count gauge is not used. 345 logger, 346 ) 347 348 return &connpool.Pool{Pool: p}, nil 349 }