github.com/grafana/pyroscope@v1.18.0/pkg/querier/worker/worker.go (about)

     1  // SPDX-License-Identifier: AGPL-3.0-only
     2  // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/querier/worker/worker.go
     3  // Provenance-includes-license: Apache-2.0
     4  // Provenance-includes-copyright: The Cortex Authors.
     5  
     6  package worker
     7  
     8  import (
     9  	"context"
    10  	"flag"
    11  	"fmt"
    12  	"os"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/go-kit/log"
    17  	"github.com/go-kit/log/level"
    18  	"github.com/grafana/dskit/grpcclient"
    19  	"github.com/grafana/dskit/services"
    20  	"github.com/pkg/errors"
    21  	"github.com/prometheus/client_golang/prometheus"
    22  	"google.golang.org/grpc"
    23  
    24  	"github.com/grafana/pyroscope/pkg/scheduler/schedulerdiscovery"
    25  	"github.com/grafana/pyroscope/pkg/util/httpgrpc"
    26  	"github.com/grafana/pyroscope/pkg/util/servicediscovery"
    27  )
    28  
// Config holds the settings for querier workers: how they find the
// query-scheduler (or query-frontend), how they identify themselves,
// and how many queries they execute concurrently.
type Config struct {
	// Explicit query-scheduler address; mutually exclusive with ring-based
	// discovery (enforced in Validate).
	SchedulerAddress string            `yaml:"scheduler_address" doc:"hidden"`
	DNSLookupPeriod  time.Duration     `yaml:"dns_lookup_duration" category:"advanced" doc:"hidden"`
	// QuerierID identifies this querier to the frontend/scheduler; defaults
	// to the hostname when left empty (see NewQuerierWorker).
	QuerierID        string            `yaml:"id" category:"advanced"`
	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the queriers and the query-frontends / query-schedulers."`

	// Total concurrent queries across all scheduler/frontend connections;
	// divided among in-use instances by getDesiredConcurrency.
	MaxConcurrent int `yaml:"max_concurrent" category:"advanced"`

	// This configuration is injected internally.
	QuerySchedulerDiscovery schedulerdiscovery.Config `yaml:"-"`
	MaxLoopDuration         time.Duration             `yaml:"-"`
}
    41  
    42  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    43  	f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to the query-frontend to identify requests from the same querier. Defaults to hostname.")
    44  	f.IntVar(&cfg.MaxConcurrent, "querier.max-concurrent", 4, "The maximum number of concurrent queries allowed.")
    45  
    46  	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f)
    47  }
    48  
    49  func (cfg *Config) Validate(log log.Logger) error {
    50  	if cfg.QuerySchedulerDiscovery.Mode == schedulerdiscovery.ModeRing && cfg.SchedulerAddress != "" {
    51  		return fmt.Errorf("scheduler address cannot be specified when query-scheduler service discovery mode is set to '%s'", cfg.QuerySchedulerDiscovery.Mode)
    52  	}
    53  
    54  	return cfg.GRPCClientConfig.Validate()
    55  }
    56  
// RequestHandler for HTTP requests wrapped in protobuf messages.
// Implementations execute a query encoded as an httpgrpc request and return
// the response (or an error) to be relayed back to the frontend/scheduler.
type RequestHandler interface {
	Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error)
}
    61  
// Single processor handles all streaming operations to query-frontend or query-scheduler to fetch queries
// and process them.
type processor interface {
	// Each invocation of processQueriesOnSingleStream starts new streaming operation to query-frontend
	// or query-scheduler to fetch queries and execute them.
	//
	// This method must react on context being finished, and stop when that happens.
	//
	// processorManager (not processor) is responsible for starting as many goroutines as needed for each connection.
	processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string)

	// notifyShutdown notifies the remote query-frontend or query-scheduler that the querier is
	// shutting down.
	notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string)
}
    77  
// serviceDiscoveryFactory makes a new service discovery instance.
// The returned service pushes InstanceAdded/InstanceRemoved/InstanceChanged
// notifications to the receiver as backends appear and disappear.
type serviceDiscoveryFactory func(receiver servicediscovery.Notifications) (services.Service, error)
    80  
// querierWorker maintains one processorManager (i.e. one gRPC connection plus
// its worker goroutines) per discovered query-scheduler / query-frontend
// instance, rebalancing concurrency as instances come and go.
type querierWorker struct {
	*services.BasicService

	cfg Config
	log log.Logger

	processor processor

	// Subservices manager.
	subservices        *services.Manager
	subservicesWatcher *services.FailureWatcher

	// mu guards managers and instances; both maps are keyed by backend address.
	mu        sync.Mutex
	managers  map[string]*processorManager
	instances map[string]servicediscovery.Instance
}
    97  
    98  func NewQuerierWorker(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (services.Service, error) {
    99  	if cfg.QuerierID == "" {
   100  		hostname, err := os.Hostname()
   101  		if err != nil {
   102  			return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID")
   103  		}
   104  		cfg.QuerierID = hostname
   105  	}
   106  
   107  	var processor processor
   108  	var servs []services.Service
   109  	var factory serviceDiscoveryFactory
   110  
   111  	switch {
   112  	case cfg.SchedulerAddress != "" || cfg.QuerySchedulerDiscovery.Mode == schedulerdiscovery.ModeRing:
   113  		level.Info(log).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress)
   114  
   115  		factory = func(receiver servicediscovery.Notifications) (services.Service, error) {
   116  			return schedulerdiscovery.New(cfg.QuerySchedulerDiscovery, cfg.SchedulerAddress, cfg.DNSLookupPeriod, "querier", receiver, log, reg)
   117  		}
   118  
   119  		processor, servs = newSchedulerProcessor(cfg, handler, log, reg)
   120  
   121  	default:
   122  		return nil, errors.New("no query-scheduler or query-frontend address")
   123  	}
   124  
   125  	return newQuerierWorkerWithProcessor(cfg, log, processor, factory, servs)
   126  }
   127  
   128  func newQuerierWorkerWithProcessor(cfg Config, log log.Logger, processor processor, newServiceDiscovery serviceDiscoveryFactory, servs []services.Service) (*querierWorker, error) {
   129  	f := &querierWorker{
   130  		cfg:       cfg,
   131  		log:       log,
   132  		managers:  map[string]*processorManager{},
   133  		instances: map[string]servicediscovery.Instance{},
   134  		processor: processor,
   135  	}
   136  
   137  	// There's no service discovery in some tests.
   138  	if newServiceDiscovery != nil {
   139  		w, err := newServiceDiscovery(f)
   140  		if err != nil {
   141  			return nil, err
   142  		}
   143  
   144  		servs = append(servs, w)
   145  	}
   146  
   147  	if len(servs) > 0 {
   148  		subservices, err := services.NewManager(servs...)
   149  		if err != nil {
   150  			return nil, errors.Wrap(err, "querier worker subservices")
   151  		}
   152  
   153  		f.subservices = subservices
   154  		f.subservicesWatcher = services.NewFailureWatcher()
   155  	}
   156  
   157  	f.BasicService = services.NewBasicService(f.starting, f.running, f.stopping)
   158  	return f, nil
   159  }
   160  
   161  func (w *querierWorker) starting(ctx context.Context) error {
   162  	if w.subservices == nil {
   163  		return nil
   164  	}
   165  
   166  	w.subservicesWatcher.WatchManager(w.subservices)
   167  	return services.StartManagerAndAwaitHealthy(ctx, w.subservices)
   168  }
   169  
// running blocks until the service context is cancelled (clean shutdown) or
// until any subservice fails, which is surfaced as the worker's failure.
func (w *querierWorker) running(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return nil
	case err := <-w.subservicesWatcher.Chan(): // The channel will be nil if w.subservicesWatcher is not set.
		return errors.Wrap(err, "querier worker subservice failed")
	}
}
   178  
// stopping tears down all processor managers (closing their query-fetching
// goroutines) and then stops the subservices, including service discovery.
func (w *querierWorker) stopping(_ error) error {
	// Stop all goroutines fetching queries. Note that in Stopping state,
	// worker no longer creates new managers in InstanceAdded method.
	w.mu.Lock()
	for address, m := range w.managers {
		m.stop()

		delete(w.managers, address)
		delete(w.instances, address)
	}
	w.mu.Unlock()

	if w.subservices == nil {
		return nil
	}

	// Stop service discovery and services used by processor.
	// Use a fresh context: the service context is already cancelled here.
	return services.StopManagerAndAwaitStopped(context.Background(), w.subservices)
}
   198  
   199  func (w *querierWorker) InstanceAdded(instance servicediscovery.Instance) {
   200  	w.mu.Lock()
   201  	defer w.mu.Unlock()
   202  
   203  	// Ensure the querier worker hasn't been stopped (or is stopping).
   204  	// This check is done inside the lock, to avoid any race condition with the stopping() function.
   205  	ctx := w.ServiceContext()
   206  	if ctx == nil || ctx.Err() != nil {
   207  		return
   208  	}
   209  
   210  	address := instance.Address
   211  	if m := w.managers[address]; m != nil {
   212  		return
   213  	}
   214  
   215  	level.Info(w.log).Log("msg", "adding connection", "addr", address, "in-use", instance.InUse)
   216  	conn, err := w.connect(context.Background(), address)
   217  	if err != nil {
   218  		level.Error(w.log).Log("msg", "error connecting", "addr", address, "err", err)
   219  		return
   220  	}
   221  
   222  	w.managers[address] = newProcessorManager(ctx, w.processor, conn, address)
   223  	w.instances[address] = instance
   224  
   225  	// Called with lock.
   226  	w.resetConcurrency()
   227  }
   228  
// InstanceRemoved implements servicediscovery.Notifications. It detaches and
// stops the manager for the removed instance, then rebalances concurrency
// across the remaining connections.
func (w *querierWorker) InstanceRemoved(instance servicediscovery.Instance) {
	address := instance.Address

	level.Info(w.log).Log("msg", "removing connection", "addr", address, "in-use", instance.InUse)

	// Detach the manager under lock, but stop it outside the lock: stop()
	// waits for worker goroutines to exit and must not block other callers.
	w.mu.Lock()
	p := w.managers[address]
	delete(w.managers, address)
	delete(w.instances, address)
	w.mu.Unlock()

	if p != nil {
		p.stop()
	}

	// Re-balance the connections between the available query-frontends / query-schedulers.
	w.mu.Lock()
	w.resetConcurrency()
	w.mu.Unlock()
}
   249  
   250  func (w *querierWorker) InstanceChanged(instance servicediscovery.Instance) {
   251  	w.mu.Lock()
   252  	defer w.mu.Unlock()
   253  
   254  	// Ensure the querier worker hasn't been stopped (or is stopping).
   255  	// This check is done inside the lock, to avoid any race condition with the stopping() function.
   256  	ctx := w.ServiceContext()
   257  	if ctx == nil || ctx.Err() != nil {
   258  		return
   259  	}
   260  
   261  	// Ensure there's a manager for the instance. If there's no, then it's a bug.
   262  	if m := w.managers[instance.Address]; m == nil {
   263  		level.Error(w.log).Log("msg", "received a notification about an unknown backend instance", "addr", instance.Address, "in-use", instance.InUse)
   264  		return
   265  	}
   266  
   267  	level.Info(w.log).Log("msg", "updating connection", "addr", instance.Address, "in-use", instance.InUse)
   268  
   269  	// Update instance and adjust concurrency.
   270  	w.instances[instance.Address] = instance
   271  
   272  	// Called with lock.
   273  	w.resetConcurrency()
   274  }
   275  
   276  // Must be called with lock.
   277  func (w *querierWorker) resetConcurrency() {
   278  	desiredConcurrency := w.getDesiredConcurrency()
   279  
   280  	for _, m := range w.managers {
   281  		concurrency, ok := desiredConcurrency[m.address]
   282  		if !ok {
   283  			// This error should never happen. If it does, it means there's a bug in the code.
   284  			level.Error(w.log).Log("msg", "a querier worker is connected to an unknown remote endpoint", "addr", m.address)
   285  
   286  			// Consider it as not in-use.
   287  			concurrency = 1
   288  		}
   289  
   290  		m.concurrency(concurrency)
   291  	}
   292  }
   293  
   294  // getDesiredConcurrency returns the number of desired connections for each discovered query-frontend / query-scheduler instance.
   295  // Must be called with lock.
   296  func (w *querierWorker) getDesiredConcurrency() map[string]int {
   297  	// Count the number of in-use instances.
   298  	numInUse := 0
   299  	for _, instance := range w.instances {
   300  		if instance.InUse {
   301  			numInUse++
   302  		}
   303  	}
   304  
   305  	var (
   306  		desired    = make(map[string]int, len(w.instances))
   307  		inUseIndex = 0
   308  	)
   309  
   310  	// Compute the number of desired connections for each discovered instance.
   311  	for address, instance := range w.instances {
   312  		// Run only 1 worker for each instance not in-use, to allow for the queues
   313  		// to be drained when the in-use instances change or if, for any reason,
   314  		// queries are enqueued on the ones not in-use.
   315  		if !instance.InUse {
   316  			desired[address] = 1
   317  			continue
   318  		}
   319  
   320  		concurrency := w.cfg.MaxConcurrent / numInUse
   321  
   322  		// If max concurrency does not evenly divide into in-use instances, then a subset will be chosen
   323  		// to receive an extra connection. Since we're iterating a map (whose iteration order is not guaranteed),
   324  		// then this should practically select a random address for the extra connection.
   325  		if inUseIndex < w.cfg.MaxConcurrent%numInUse {
   326  			level.Warn(w.log).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", address)
   327  			concurrency++
   328  		}
   329  
   330  		// If concurrency is 0 then MaxConcurrentRequests is less than the total number of
   331  		// frontends/schedulers. In order to prevent accidentally starving a frontend or scheduler we are just going to
   332  		// always connect once to every target.
   333  		if concurrency == 0 {
   334  			concurrency = 1
   335  		}
   336  
   337  		desired[address] = concurrency
   338  		inUseIndex++
   339  	}
   340  
   341  	return desired
   342  }
   343  
   344  func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) {
   345  	// Because we only use single long-running method, it doesn't make sense to inject user ID, send over tracing or add metrics.
   346  	opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil, nil)
   347  	if err != nil {
   348  		return nil, err
   349  	}
   350  
   351  	conn, err := grpc.DialContext(ctx, address, opts...)
   352  	if err != nil {
   353  		return nil, err
   354  	}
   355  	return conn, nil
   356  }