github.com/grafana/pyroscope@v1.18.0/pkg/frontend/frontend.go (about) 1 // SPDX-License-Identifier: AGPL-3.0-only 2 // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/frontend/v2/frontend.go 3 // Provenance-includes-license: Apache-2.0 4 // Provenance-includes-copyright: The Cortex Authors. 5 6 package frontend 7 8 import ( 9 "context" 10 "flag" 11 "fmt" 12 "math/rand" 13 "net" 14 "net/http" 15 "strconv" 16 "sync" 17 "time" 18 19 "connectrpc.com/connect" 20 "github.com/go-kit/log" 21 "github.com/go-kit/log/level" 22 "github.com/grafana/dskit/flagext" 23 "github.com/grafana/dskit/grpcclient" 24 "github.com/grafana/dskit/netutil" 25 "github.com/grafana/dskit/services" 26 "github.com/opentracing/opentracing-go" 27 "github.com/pkg/errors" 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promauto" 30 "go.uber.org/atomic" 31 32 "github.com/grafana/dskit/tenant" 33 34 "github.com/grafana/pyroscope/api/gen/proto/go/vcs/v1/vcsv1connect" 35 "github.com/grafana/pyroscope/pkg/frontend/frontendpb" 36 "github.com/grafana/pyroscope/pkg/frontend/vcs" 37 "github.com/grafana/pyroscope/pkg/querier/stats" 38 "github.com/grafana/pyroscope/pkg/scheduler/schedulerdiscovery" 39 "github.com/grafana/pyroscope/pkg/util/connectgrpc" 40 "github.com/grafana/pyroscope/pkg/util/httpgrpc" 41 "github.com/grafana/pyroscope/pkg/util/httpgrpcutil" 42 "github.com/grafana/pyroscope/pkg/validation" 43 ) 44 45 // Config for a Frontend. 
type Config struct {
	// Explicit query-scheduler address; must be empty when ring-based
	// scheduler discovery is used (enforced in Validate).
	SchedulerAddress  string            `yaml:"scheduler_address" doc:"hidden"`
	DNSLookupPeriod   time.Duration     `yaml:"scheduler_dns_lookup_period" category:"advanced" doc:"hidden"`
	// Number of concurrent workers forwarding queries to a single
	// query-scheduler; also sizes the retry budget in RoundTripGRPC.
	WorkerConcurrency int               `yaml:"scheduler_worker_concurrency" category:"advanced"`
	GRPCClientConfig  grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the query-frontends and the query-schedulers."`

	// Used to find local IP address, that is sent to scheduler and querier-worker.
	InfNames   []string `yaml:"instance_interface_names" category:"advanced" doc:"default=[<private network interfaces>]"`
	Addr       string   `yaml:"instance_addr" category:"advanced"`
	EnableIPv6 bool     `yaml:"instance_enable_ipv6" category:"advanced"`
	Port       int      `yaml:"instance_port" category:"advanced"`

	// For backward compatibility only. The parameter has a name that is
	// inconsistent with the way address is specified in other places.
	// The parameter is replaced with `instance_addr`.
	AddrOld string `yaml:"address" category:"advanced" doc:"hidden"`

	// This configuration is injected internally.
	QuerySchedulerDiscovery schedulerdiscovery.Config `yaml:"-"`
	MaxLoopDuration         time.Duration             `yaml:"-"`
}

// RegisterFlags registers the query-frontend CLI flags on f and initializes
// defaults, including auto-detection of private network interfaces for
// InfNames (falling back to eth0/en0).
func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {
	f.IntVar(&cfg.WorkerConcurrency, "query-frontend.scheduler-worker-concurrency", 5, "Number of concurrent workers forwarding queries to single query-scheduler.")

	cfg.InfNames = netutil.PrivateNetworkInterfacesWithFallback([]string{"eth0", "en0"}, logger)
	f.Var((*flagext.StringSlice)(&cfg.InfNames), "query-frontend.instance-interface-names", "List of network interface names to look up when finding the instance IP address. This address is sent to query-scheduler and querier, which uses it to send the query response back to query-frontend.")
	f.StringVar(&cfg.Addr, "query-frontend.instance-addr", "", "IP address to advertise to the querier (via scheduler) (default is auto-detected from network interfaces).")
	f.BoolVar(&cfg.EnableIPv6, "query-frontend.instance-enable-ipv6", false, "Enable using a IPv6 instance address. (default false)")
	f.IntVar(&cfg.Port, "query-frontend.instance-port", 0, "Port to advertise to query-scheduler and querier (defaults to -server.http-listen-port).")
	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("query-frontend.grpc-client-config", f)
}

// Validate checks the configuration for consistency: an explicit scheduler
// address cannot be combined with ring-based scheduler service discovery.
// It also validates the embedded gRPC client configuration.
func (cfg *Config) Validate() error {
	if cfg.QuerySchedulerDiscovery.Mode == schedulerdiscovery.ModeRing && cfg.SchedulerAddress != "" {
		return fmt.Errorf("scheduler address cannot be specified when query-scheduler service discovery mode is set to '%s'", cfg.QuerySchedulerDiscovery.Mode)
	}

	return cfg.GRPCClientConfig.Validate()
}

// Frontend implements GrpcRoundTripper. It queues HTTP requests,
// dispatches them to backends via gRPC, and handles retries for requests which failed.
type Frontend struct {
	services.Service
	connectgrpc.GRPCRoundTripper
	vcsv1connect.VCSServiceHandler
	frontendpb.UnimplementedFrontendForQuerierServer

	cfg Config
	log log.Logger

	// Counter used to assign query IDs; randomized at startup (see
	// NewFrontend) so stale post-restart responses do not match live queries.
	lastQueryID atomic.Uint64

	// frontend workers will read from this channel, and send request to scheduler.
	requestsCh chan *frontendRequest

	limits                  Limits
	schedulerWorkers        *frontendSchedulerWorkers
	schedulerWorkersWatcher *services.FailureWatcher
	// requests tracks queries in flight so QueryResult can route responses
	// back to the goroutine waiting in RoundTripGRPC.
	requests *requestsInProgress
}

// Limits is the subset of per-tenant limits the query-frontend needs.
type Limits interface {
	QuerySplitDuration(string) time.Duration
	MaxQueryParallelism(string) int
	MaxQueryLength(tenantID string) time.Duration
	MaxQueryLookback(tenantID string) time.Duration
	QueryAnalysisEnabled(string) bool
	SymbolizerEnabled(string) bool
	QuerySanitizeOnMerge(string) bool
	validation.FlameGraphLimits
}

// frontendRequest tracks a single in-flight query from enqueueing at a
// scheduler until the querier delivers the result via QueryResult.
type frontendRequest struct {
	queryID      uint64
	request      *httpgrpc.HTTPRequest
	userID       string
	statsEnabled bool

	// cancel aborts the per-request context created in RoundTripGRPC.
	cancel context.CancelFunc

	// enqueue receives the outcome of the scheduler enqueue attempt;
	// response receives the query result pushed by QueryResult.
	// Both are buffered (size 1) so senders never block — see RoundTripGRPC.
	enqueue  chan enqueueResult
	response chan *frontendpb.QueryResultRequest
}

// enqueueStatus is the outcome of forwarding a request to a scheduler.
type enqueueStatus int

const (
	// Sent to scheduler successfully, and frontend should wait for response now.
	waitForResponse enqueueStatus = iota

	// Failed to forward request to scheduler, frontend will try again.
	failed
)

// enqueueResult is reported by a scheduler worker after an enqueue attempt.
type enqueueResult struct {
	status enqueueStatus

	cancelCh chan<- uint64 // Channel that can be used for request cancellation. If nil, cancellation is not possible.
}

// NewFrontend creates a new frontend.
func NewFrontend(cfg Config, limits Limits, log log.Logger, reg prometheus.Registerer) (*Frontend, error) {
	// Unbuffered: scheduler workers pull requests directly from the
	// goroutines blocked in RoundTripGRPC.
	requestsCh := make(chan *frontendRequest)

	schedulerWorkers, err := newFrontendSchedulerWorkers(cfg, net.JoinHostPort(cfg.Addr, strconv.Itoa(cfg.Port)), requestsCh, log, reg)
	if err != nil {
		return nil, err
	}

	f := &Frontend{
		cfg:                     cfg,
		log:                     log,
		limits:                  limits,
		requestsCh:              requestsCh,
		schedulerWorkers:        schedulerWorkers,
		schedulerWorkersWatcher: services.NewFailureWatcher(),
		requests:                newRequestsInProgress(),
		VCSServiceHandler:       vcs.New(log, reg),
	}
	f.GRPCRoundTripper = &realFrontendRoundTripper{frontend: f}
	// Randomize to avoid getting responses from queries sent before restart, which could lead to mixing results
	// between different queries. Note that frontend verifies the user, so it cannot leak results between tenants.
	// This isn't perfect, but better than nothing.
	f.lastQueryID.Store(rand.Uint64())

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "pyroscope_query_frontend_queries_in_progress",
		Help: "Number of queries in progress handled by this frontend.",
	}, func() float64 {
		return float64(f.requests.count())
	})

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "pyroscope_query_frontend_connected_schedulers",
		Help: "Number of schedulers this frontend is connected to.",
	}, func() float64 {
		return float64(f.schedulerWorkers.getWorkersCount())
	})

	f.Service = services.NewBasicService(f.starting, f.running, f.stopping)
	return f, nil
}

// starting registers the scheduler workers with the failure watcher and
// starts them, waiting until they are running.
func (f *Frontend) starting(ctx context.Context) error {
	f.schedulerWorkersWatcher.WatchService(f.schedulerWorkers)

	return errors.Wrap(services.StartAndAwaitRunning(ctx, f.schedulerWorkers), "failed to start frontend scheduler workers")
}

// running blocks until the service context is canceled (clean shutdown)
// or a watched sub-service fails.
func (f *Frontend) running(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return nil
	case err := <-f.schedulerWorkersWatcher.Chan():
		return errors.Wrap(err, "query-frontend subservice failed")
	}
}

// stopping shuts down the scheduler workers and waits for termination.
func (f *Frontend) stopping(_ error) error {
	return errors.Wrap(services.StopAndAwaitTerminated(context.Background(), f.schedulerWorkers), "failed to stop frontend scheduler workers")
}

// realFrontendRoundTripper allows testing the frontend without the need
// of a real round tripper.
type realFrontendRoundTripper struct {
	frontend *Frontend
}

// RoundTripGRPC round trips a proto (instead of an HTTP request).
// It registers the request, enqueues it at a scheduler (retrying failed
// enqueues up to WorkerConcurrency+1 times), then waits for either the
// querier's response or context cancellation.
func (rt *realFrontendRoundTripper) RoundTripGRPC(ctx context.Context, req *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
	f := rt.frontend

	if s := f.State(); s != services.Running {
		return nil, fmt.Errorf("frontend not running: %v", s)
	}

	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		return nil, err
	}
	userID := tenant.JoinTenantIDs(tenantIDs)

	// Propagate trace context in gRPC too - this will be ignored if using HTTP.
	tracer, span := opentracing.GlobalTracer(), opentracing.SpanFromContext(ctx)
	if tracer != nil && span != nil {
		carrier := (*httpgrpcutil.HttpgrpcHeadersCarrier)(req)
		if err := tracer.Inject(span.Context(), opentracing.HTTPHeaders, carrier); err != nil {
			return nil, err
		}
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	freq := &frontendRequest{
		queryID:      f.lastQueryID.Inc(),
		request:      req,
		userID:       userID,
		statsEnabled: stats.IsEnabled(ctx),

		cancel: cancel,

		// Buffer of 1 to ensure response or error can be written to the channel
		// even if this goroutine goes away due to client context cancellation.
		enqueue:  make(chan enqueueResult, 1),
		response: make(chan *frontendpb.QueryResultRequest, 1),
	}

	// Register so QueryResult can find this request; always deregister on exit.
	f.requests.put(freq)
	defer f.requests.delete(freq.queryID)

	retries := f.cfg.WorkerConcurrency + 1 // To make sure we hit at least two different schedulers.

enqueueAgain:
	var cancelCh chan<- uint64
	select {
	case <-ctx.Done():
		return nil, ctx.Err()

	case f.requestsCh <- freq:
		// Enqueued, let's wait for response.
		enqRes := <-freq.enqueue
		if enqRes.status == waitForResponse {
			cancelCh = enqRes.cancelCh
			break // go wait for response.
		} else if enqRes.status == failed {
			retries--
			if retries > 0 {
				goto enqueueAgain
			}
		}

		return nil, httpgrpc.Errorf(http.StatusInternalServerError, "failed to enqueue request")
	}

	select {
	case <-ctx.Done():
		// Best-effort: tell the scheduler the query is canceled; the send is
		// non-blocking so a full cancellation queue never stalls shutdown.
		if cancelCh != nil {
			select {
			case cancelCh <- freq.queryID:
				// cancellation sent.
			default:
				// failed to cancel, ignore.
				level.Warn(f.log).Log("msg", "failed to send cancellation request to scheduler, queue full")
			}
		}
		return nil, ctx.Err()

	case resp := <-freq.response:
		if stats.ShouldTrackHTTPGRPCResponse(resp.HttpResponse) {
			stats.FromContext(ctx).Merge(resp.Stats) // Safe if stats is nil.
		}

		return resp.HttpResponse, nil
	}
}

// QueryResult delivers a query result from a querier to the goroutine
// waiting in RoundTripGRPC, matching on queryID and verifying the tenant.
func (f *Frontend) QueryResult(ctx context.Context, r *connect.Request[frontendpb.QueryResultRequest]) (*connect.Response[frontendpb.QueryResultResponse], error) {
	qrReq := r.Msg
	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		return nil, err
	}
	userID := tenant.JoinTenantIDs(tenantIDs)

	req := f.requests.get(qrReq.QueryID)
	// It is possible that some old response belonging to different user was received, if frontend has restarted.
	// To avoid leaking query results between users, we verify the user here.
	// To avoid mixing results from different queries, we randomize queryID counter on start.
	if req != nil && req.userID == userID {
		select {
		case req.response <- qrReq:
			// Should always be possible, unless QueryResult is called multiple times with the same queryID.
		default:
			level.Warn(f.log).Log("msg", "failed to write query result to the response channel", "queryID", qrReq.QueryID, "tenant", userID)
		}
	}

	return connect.NewResponse(&frontendpb.QueryResultResponse{}), nil
}

// CheckReady determines if the query frontend is ready. Function parameters/return
// chosen to match the same method in the ingester.
func (f *Frontend) CheckReady(_ context.Context) error {
	workers := f.schedulerWorkers.getWorkersCount()

	// If frontend is connected to at least one scheduler, we are ready.
	if workers > 0 {
		return nil
	}

	msg := fmt.Sprintf("not ready: number of schedulers this worker is connected to is %d", workers)
	level.Info(f.log).Log("msg", msg)
	return errors.New(msg)
}

// requestsInProgress is a mutex-guarded map of in-flight requests keyed by
// queryID, shared between RoundTripGRPC (put/delete) and QueryResult (get).
type requestsInProgress struct {
	mu       sync.Mutex
	requests map[uint64]*frontendRequest
}

func newRequestsInProgress() *requestsInProgress {
	return &requestsInProgress{
		requests: map[uint64]*frontendRequest{},
	}
}

// count returns the number of requests currently in flight.
func (r *requestsInProgress) count() int {
	r.mu.Lock()
	defer r.mu.Unlock()

	return len(r.requests)
}

func (r *requestsInProgress) put(req *frontendRequest) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.requests[req.queryID] = req
}

func (r *requestsInProgress) delete(queryID uint64) {
	r.mu.Lock()
	defer r.mu.Unlock()

	delete(r.requests, queryID)
}

// get returns the in-flight request for queryID, or nil if not present.
func (r *requestsInProgress) get(queryID uint64) *frontendRequest {
	r.mu.Lock()
	defer r.mu.Unlock()

	return r.requests[queryID]
}