github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/querier/worker/worker.go

package worker

import (
	"context"
	"flag"
	"os"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/weaveworks/common/httpgrpc"
	"google.golang.org/grpc"

	"github.com/cortexproject/cortex/pkg/util"
)

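// Config holds the configuration used by the querier workers to connect to
// a query-frontend or query-scheduler and execute the queries they receive.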
type Config struct {
	FrontendAddress  string        `yaml:"frontend_address"`
	SchedulerAddress string        `yaml:"scheduler_address"`
	DNSLookupPeriod  time.Duration `yaml:"dns_lookup_duration"`

	Parallelism           int  `yaml:"parallelism"`
	MatchMaxConcurrency   bool `yaml:"match_max_concurrent"`
	MaxConcurrentRequests int  `yaml:"-"` // Must be same as passed to PromQL Engine.

	QuerierID string `yaml:"id"`

	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
}

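// RegisterFlags registers the worker flags with the provided FlagSet.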
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.SchedulerAddress, "querier.scheduler-address", "", "Hostname (and port) of scheduler that querier will periodically resolve, connect to and receive queries from. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")
	f.StringVar(&cfg.FrontendAddress, "querier.frontend-address", "", "Address of query frontend service, in host:port format. If -querier.scheduler-address is set as well, querier will use scheduler instead. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")

	f.DurationVar(&cfg.DNSLookupPeriod, "querier.dns-lookup-period", 10*time.Second, "How often to query DNS for query-frontend or query-scheduler address.")

	f.IntVar(&cfg.Parallelism, "querier.worker-parallelism", 10, "Number of simultaneous queries to process per query-frontend or query-scheduler.")
	f.BoolVar(&cfg.MatchMaxConcurrency, "querier.worker-match-max-concurrent", false, "Force worker concurrency to match the -querier.max-concurrent option. Overrides querier.worker-parallelism.")
	f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to frontend service to identify requests from the same querier. Defaults to hostname.")

	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f)
}

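// Validate checks that at most one of the frontend and scheduler addresses is
// configured, and validates the gRPC client config.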
func (cfg *Config) Validate(log log.Logger) error {
	if cfg.FrontendAddress != "" && cfg.SchedulerAddress != "" {
		return errors.New("frontend address and scheduler address are mutually exclusive, please use only one")
	}
	return cfg.GRPCClientConfig.Validate(log)
}

// RequestHandler handles HTTP requests wrapped in protobuf messages.
type RequestHandler interface {
	Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error)
}

// A processor handles all streaming operations to the query-frontend or query-scheduler
// to fetch queries and process them.
type processor interface {
	// Each invocation of processQueriesOnSingleStream starts a new streaming operation to
	// the query-frontend or query-scheduler to fetch queries and execute them.
	//
	// This method must react to the context being canceled, and stop when that happens.
	//
	// processorManager (not processor) is responsible for starting as many goroutines as needed for each connection.
	processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string)

	// notifyShutdown notifies the remote query-frontend or query-scheduler that the querier is
	// shutting down.
	notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string)
}

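// querierWorker runs one processorManager per discovered query-frontend or
// query-scheduler address and spreads the configured concurrency across them.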
type querierWorker struct {
	*services.BasicService

	cfg Config
	log log.Logger

	processor processor

	subservices *services.Manager

	mu sync.Mutex
	// Once the worker enters the Stopping state, no new managers are added to this map
	// (see the service context check in AddressAdded).
	managers map[string]*processorManager
}

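// NewQuerierWorker creates a querier worker service that connects to the configured
// query-scheduler (preferred if both are set) or query-frontend, and executes the
// received queries using the given handler. If no querier ID is configured, the
// hostname is used.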
func NewQuerierWorker(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (services.Service, error) {
	if cfg.QuerierID == "" {
		hostname, err := os.Hostname()
		if err != nil {
			return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID")
		}
		cfg.QuerierID = hostname
	}

	var processor processor
	var servs []services.Service
	var address string

	switch {
	case cfg.SchedulerAddress != "":
		level.Info(log).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress)

		address = cfg.SchedulerAddress
		processor, servs = newSchedulerProcessor(cfg, handler, log, reg)

	case cfg.FrontendAddress != "":
		level.Info(log).Log("msg", "Starting querier worker connected to query-frontend", "frontend", cfg.FrontendAddress)

		address = cfg.FrontendAddress
		processor = newFrontendProcessor(cfg, handler, log)

	default:
		return nil, errors.New("no query-scheduler or query-frontend address")
	}

	return newQuerierWorkerWithProcessor(cfg, log, processor, address, servs)
}

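// newQuerierWorkerWithProcessor wires the worker together with the given processor and,
// for a non-empty address, a DNS watcher that keeps the set of target connections up to date.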
func newQuerierWorkerWithProcessor(cfg Config, log log.Logger, processor processor, address string, servs []services.Service) (*querierWorker, error) {
	f := &querierWorker{
		cfg:       cfg,
		log:       log,
		managers:  map[string]*processorManager{},
		processor: processor,
	}

	// Empty address is only used in tests, where individual targets are added manually.
	if address != "" {
		w, err := util.NewDNSWatcher(address, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}

		servs = append(servs, w)
	}

	if len(servs) > 0 {
		subservices, err := services.NewManager(servs...)
		if err != nil {
			return nil, errors.Wrap(err, "querier worker subservices")
		}

		f.subservices = subservices
	}

	f.BasicService = services.NewIdleService(f.starting, f.stopping)
	return f, nil
}

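// starting starts the subservices (DNS watcher and processor services), if any.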
func (w *querierWorker) starting(ctx context.Context) error {
	if w.subservices == nil {
		return nil
	}
	return services.StartManagerAndAwaitHealthy(ctx, w.subservices)
}

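// stopping stops all processor managers and then the worker's subservices.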
func (w *querierWorker) stopping(_ error) error {
	// Stop all goroutines fetching queries. Note that in the Stopping state the
	// worker no longer creates new managers in the AddressAdded method.
	w.mu.Lock()
	for _, m := range w.managers {
		m.stop()
	}
	w.mu.Unlock()

	if w.subservices == nil {
		return nil
	}

	// Stop DNS watcher and services used by processor.
	return services.StopManagerAndAwaitStopped(context.Background(), w.subservices)
}

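// AddressAdded is called by the DNS watcher when a new query-frontend or query-scheduler
// address is discovered. It connects to the address and starts a processor manager for it,
// unless the worker is already stopping.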
func (w *querierWorker) AddressAdded(address string) {
	ctx := w.ServiceContext()
	if ctx == nil || ctx.Err() != nil {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()

	if m := w.managers[address]; m != nil {
		return
	}

	level.Info(w.log).Log("msg", "adding connection", "addr", address)
	conn, err := w.connect(context.Background(), address)
	if err != nil {
		level.Error(w.log).Log("msg", "error connecting", "addr", address, "err", err)
		return
	}

	w.managers[address] = newProcessorManager(ctx, w.processor, conn, address)
	// Called with lock.
	w.resetConcurrency()
}

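// AddressRemoved is called by the DNS watcher when an address disappears from DNS. It removes
// and stops the corresponding processor manager and rebalances concurrency across the rest.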
func (w *querierWorker) AddressRemoved(address string) {
	level.Info(w.log).Log("msg", "removing connection", "addr", address)

	w.mu.Lock()
	p := w.managers[address]
	delete(w.managers, address)
	// Called with lock.
	w.resetConcurrency()
	w.mu.Unlock()

	if p != nil {
		p.stop()
	}
}

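// resetConcurrency recomputes how many concurrent queries each processor manager should run:
// either the fixed -querier.worker-parallelism, or an even split of -querier.max-concurrent
// across all connected targets when -querier.worker-match-max-concurrent is enabled.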
// Must be called with lock.
func (w *querierWorker) resetConcurrency() {
	totalConcurrency := 0
	index := 0

	for _, m := range w.managers {
		concurrency := 0

		if w.cfg.MatchMaxConcurrency {
			concurrency = w.cfg.MaxConcurrentRequests / len(w.managers)

			// If max concurrency does not divide evenly across the targets, a subset of them
			// receives an extra connection. Map iteration order is randomized by Go, so this is
			// an effectively random selection of targets.
			if index < w.cfg.MaxConcurrentRequests%len(w.managers) {
				level.Warn(w.log).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", m.address)
				concurrency++
			}
		} else {
			concurrency = w.cfg.Parallelism
		}

		// If concurrency is 0, then MaxConcurrentRequests is less than the total number of
		// frontends/schedulers. To avoid accidentally starving a frontend or scheduler, we always
		// connect at least once to every target. This is dangerous because the total may then
		// exceed the PromQL max concurrency.
		if concurrency == 0 {
			concurrency = 1
		}

		totalConcurrency += concurrency
		m.concurrency(concurrency)
		index++
	}

	if totalConcurrency > w.cfg.MaxConcurrentRequests {
		level.Warn(w.log).Log("msg", "total worker concurrency is greater than promql max concurrency. Queries may be queued in the querier which reduces QOS")
	}
}

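// connect dials the given query-frontend or query-scheduler address using the configured
// gRPC client options.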
func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) {
	// Because we only use a single long-running method, it doesn't make sense to inject the
	// user ID, propagate tracing, or add metrics.
	opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil)
	if err != nil {
		return nil, err
	}

	conn, err := grpc.DialContext(ctx, address, opts...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}