github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/worker/worker.go

package worker

import (
	"context"
	"flag"
	"os"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/weaveworks/common/httpgrpc"
	"google.golang.org/grpc"

	"github.com/grafana/loki/pkg/util"
	lokiutil "github.com/grafana/loki/pkg/util"
)
type Config struct {
	FrontendAddress  string        `yaml:"frontend_address"`
	SchedulerAddress string        `yaml:"scheduler_address"`
	DNSLookupPeriod  time.Duration `yaml:"dns_lookup_duration"`

	Parallelism           int  `yaml:"parallelism"`
	MatchMaxConcurrency   bool `yaml:"match_max_concurrent"`
	MaxConcurrentRequests int  `yaml:"-"` // Must be the same value that is passed to the LogQL engine.

	QuerierID string `yaml:"id"`

	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.SchedulerAddress, "querier.scheduler-address", "", "Hostname (and port) of scheduler that querier will periodically resolve, connect to and receive queries from. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")
	f.StringVar(&cfg.FrontendAddress, "querier.frontend-address", "", "Address of query frontend service, in host:port format. If -querier.scheduler-address is set as well, querier will use scheduler instead. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")

	f.DurationVar(&cfg.DNSLookupPeriod, "querier.dns-lookup-period", 3*time.Second, "How often to query DNS for query-frontend or query-scheduler address. Also used to determine how often to poll the scheduler-ring for addresses if the scheduler-ring is configured.")

	f.IntVar(&cfg.Parallelism, "querier.worker-parallelism", 10, "Number of simultaneous queries to process per query-frontend or query-scheduler.")
	f.BoolVar(&cfg.MatchMaxConcurrency, "querier.worker-match-max-concurrent", true, "Force worker concurrency to match the -querier.max-concurrent option. Overrides querier.worker-parallelism.")
	f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to frontend service to identify requests from the same querier. Defaults to hostname.")

	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f)
}
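
// For illustration, the fields above map to a YAML block in the Loki
// configuration file roughly as follows. The enclosing block name is defined
// by the parent config that embeds this Config, and the address shown is a
// placeholder:
//
//	scheduler_address: "query-scheduler.loki.svc:9095"
//	dns_lookup_duration: 3s
//	parallelism: 10
//	match_max_concurrent: true
//	id: "querier-0"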

func (cfg *Config) Validate(log log.Logger) error {
	if cfg.FrontendAddress != "" && cfg.SchedulerAddress != "" {
		return errors.New("frontend address and scheduler address are mutually exclusive, please use only one")
	}
	return cfg.GRPCClientConfig.Validate(log)
}

// RequestHandler handles HTTP requests wrapped in protobuf messages.
type RequestHandler interface {
	Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error)
}
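
// For illustration, a trivial implementation of RequestHandler (a sketch, not
// used by this package) could simply echo the request body back:
//
//	type echoHandler struct{}
//
//	func (echoHandler) Handle(_ context.Context, req *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
//		return &httpgrpc.HTTPResponse{Code: 200, Body: req.Body}, nil
//	}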

// A processor handles all streaming operations to the query-frontend or
// query-scheduler, fetching queries and processing them.
type processor interface {
	// Each invocation of processQueriesOnSingleStream starts a new streaming operation to the
	// query-frontend or query-scheduler to fetch queries and execute them.
	//
	// This method must react to the context being canceled, and stop when that happens.
	//
	// processorManager (not processor) is responsible for starting as many goroutines as needed for each connection.
	processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string)

	// notifyShutdown notifies the remote query-frontend or query-scheduler that the querier is
	// shutting down.
	notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string)
}
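
// For illustration, a minimal no-op implementation of processor (a sketch, not
// used by this package) would block until the context is done:
//
//	type nopProcessor struct{}
//
//	func (nopProcessor) processQueriesOnSingleStream(ctx context.Context, _ *grpc.ClientConn, _ string) {
//		<-ctx.Done() // a real processor would open a stream here and loop until cancellation
//	}
//
//	func (nopProcessor) notifyShutdown(context.Context, *grpc.ClientConn, string) {}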

type querierWorker struct {
	*services.BasicService

	cfg    Config
	logger log.Logger

	processor processor

	subservices *services.Manager

	mu sync.Mutex
	// Protected by mu. No new managers are created once the worker starts stopping;
	// AddressAdded checks the service context before adding one.
	managers map[string]*processorManager

	metrics *Metrics
}

func NewQuerierWorker(cfg Config, rng ring.ReadRing, handler RequestHandler, logger log.Logger, reg prometheus.Registerer) (services.Service, error) {
	if cfg.QuerierID == "" {
		hostname, err := os.Hostname()
		if err != nil {
			return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID")
		}
		cfg.QuerierID = hostname
	}

	metrics := NewMetrics(cfg, reg)
	var processor processor
	var servs []services.Service
	var address string

	switch {
	case rng != nil:
		level.Info(logger).Log("msg", "Starting querier worker using query-scheduler and scheduler ring for addresses")
		processor, servs = newSchedulerProcessor(cfg, handler, logger, metrics)
	case cfg.SchedulerAddress != "":
		level.Info(logger).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress)

		address = cfg.SchedulerAddress
		processor, servs = newSchedulerProcessor(cfg, handler, logger, metrics)

	case cfg.FrontendAddress != "":
		level.Info(logger).Log("msg", "Starting querier worker connected to query-frontend", "frontend", cfg.FrontendAddress)

		address = cfg.FrontendAddress
		processor = newFrontendProcessor(cfg, handler, logger)
	default:
		return nil, errors.New("unable to start the querier worker, need to configure one of frontend_address, scheduler_address, or a ring config in the query_scheduler config block")
	}

	return newQuerierWorkerWithProcessor(cfg, metrics, logger, processor, address, rng, servs)
}
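
// For illustration, a caller might wire the worker up roughly like this
// (a sketch; cfg, handler, logger, and ctx come from the caller):
//
//	worker, err := NewQuerierWorker(cfg, nil, handler, logger, prometheus.DefaultRegisterer)
//	if err != nil {
//		return err
//	}
//	if err := services.StartAndAwaitRunning(ctx, worker); err != nil {
//		return err
//	}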

func newQuerierWorkerWithProcessor(cfg Config, metrics *Metrics, logger log.Logger, processor processor, address string, ring ring.ReadRing, servs []services.Service) (*querierWorker, error) {
	f := &querierWorker{
		cfg:       cfg,
		logger:    logger,
		managers:  map[string]*processorManager{},
		processor: processor,
		metrics:   metrics,
	}

	// Empty address is only used in tests, where individual targets are added manually.
	if address != "" {
		w, err := util.NewDNSWatcher(address, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}

		servs = append(servs, w)
	}

	if ring != nil {
		w, err := lokiutil.NewRingWatcher(log.With(logger, "component", "querier-scheduler-worker"), ring, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}
		servs = append(servs, w)
	}

	if len(servs) > 0 {
		subservices, err := services.NewManager(servs...)
		if err != nil {
			return nil, errors.Wrap(err, "querier worker subservices")
		}

		f.subservices = subservices
	}

	f.BasicService = services.NewIdleService(f.starting, f.stopping)
	return f, nil
}

func (w *querierWorker) starting(ctx context.Context) error {
	if w.subservices == nil {
		return nil
	}
	return services.StartManagerAndAwaitHealthy(ctx, w.subservices)
}

func (w *querierWorker) stopping(_ error) error {
	// Stop all goroutines fetching queries. Note that in the Stopping state,
	// the worker no longer creates new managers in AddressAdded.
	w.mu.Lock()
	for _, m := range w.managers {
		m.stop()
	}
	w.mu.Unlock()

	if w.subservices == nil {
		return nil
	}

	// Stop the DNS watcher and the services used by the processor.
	return services.StopManagerAndAwaitStopped(context.Background(), w.subservices)
}

func (w *querierWorker) AddressAdded(address string) {
	ctx := w.ServiceContext()
	if ctx == nil || ctx.Err() != nil {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()

	if m := w.managers[address]; m != nil {
		return
	}

	level.Info(w.logger).Log("msg", "adding connection", "addr", address)
	conn, err := w.connect(context.Background(), address)
	if err != nil {
		level.Error(w.logger).Log("msg", "error connecting", "addr", address, "err", err)
		return
	}

	w.managers[address] = newProcessorManager(ctx, w.processor, conn, address)
	// Called with lock.
	w.resetConcurrency()
}

func (w *querierWorker) AddressRemoved(address string) {
	level.Info(w.logger).Log("msg", "removing connection", "addr", address)

	w.mu.Lock()
	p := w.managers[address]
	delete(w.managers, address)
	// Called with lock.
	w.resetConcurrency()
	w.mu.Unlock()

	if p != nil {
		p.stop()
	}
}

// Must be called with lock.
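// For example, when MatchMaxConcurrency is set with MaxConcurrentRequests = 10
// and 3 managers, the split is 10/3 = 3 with remainder 1, so one manager runs
// 4 workers and the other two run 3 each (4+3+3 = 10).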
func (w *querierWorker) resetConcurrency() {
	totalConcurrency := 0
	defer func() {
		w.metrics.concurrentWorkers.Set(float64(totalConcurrency))
	}()
	index := 0

	for _, m := range w.managers {
		concurrency := 0

		if w.cfg.MatchMaxConcurrency {
			concurrency = w.cfg.MaxConcurrentRequests / len(w.managers)

			// If max concurrency does not divide evenly across our targets, a subset of them
			// receives an extra connection. Iteration order over the managers map is randomized,
			// so the extras land on an effectively random selection of targets.
			if index < w.cfg.MaxConcurrentRequests%len(w.managers) {
				level.Warn(w.logger).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", m.address)
				concurrency++
			}
		} else {
			concurrency = w.cfg.Parallelism
		}

		// If concurrency is 0, then MaxConcurrentRequests is less than the total number of
		// frontends/schedulers. To avoid accidentally starving a frontend or scheduler, we
		// always connect at least once to every target. This is dangerous because we may then
		// exceed the LogQL max concurrency.
		if concurrency == 0 {
			concurrency = 1
		}

		totalConcurrency += concurrency
		m.concurrency(concurrency)
		index++
	}

	if totalConcurrency > w.cfg.MaxConcurrentRequests {
		level.Warn(w.logger).Log("msg", "total worker concurrency is greater than logql max concurrency. Queries may be queued in the querier which reduces QOS")
	}
}

func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) {
	// Because we only use a single long-running method, it doesn't make sense to inject a user ID,
	// propagate tracing, or add metrics.
	opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil)
	if err != nil {
		return nil, err
	}

	conn, err := grpc.DialContext(ctx, address, opts...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}