github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/worker/worker.go

package worker

import (
	"context"
	"flag"
	"os"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/weaveworks/common/httpgrpc"
	"google.golang.org/grpc"

	"github.com/grafana/loki/pkg/util"
	lokiutil "github.com/grafana/loki/pkg/util"
)

type Config struct {
	FrontendAddress  string        `yaml:"frontend_address"`
	SchedulerAddress string        `yaml:"scheduler_address"`
	DNSLookupPeriod  time.Duration `yaml:"dns_lookup_duration"`

	Parallelism           int  `yaml:"parallelism"`
	MatchMaxConcurrency   bool `yaml:"match_max_concurrent"`
	MaxConcurrentRequests int  `yaml:"-"` // Must be the same value as passed to the LogQL Engine.

	QuerierID string `yaml:"id"`

	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.SchedulerAddress, "querier.scheduler-address", "", "Hostname (and port) of scheduler that querier will periodically resolve, connect to and receive queries from. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")
	f.StringVar(&cfg.FrontendAddress, "querier.frontend-address", "", "Address of query frontend service, in host:port format. If -querier.scheduler-address is set as well, querier will use scheduler instead. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")

	f.DurationVar(&cfg.DNSLookupPeriod, "querier.dns-lookup-period", 3*time.Second, "How often to query DNS for query-frontend or query-scheduler address. Also used to determine how often to poll the scheduler-ring for addresses if the scheduler-ring is configured.")

	f.IntVar(&cfg.Parallelism, "querier.worker-parallelism", 10, "Number of simultaneous queries to process per query-frontend or query-scheduler.")
	f.BoolVar(&cfg.MatchMaxConcurrency, "querier.worker-match-max-concurrent", true, "Force worker concurrency to match the -querier.max-concurrent option. Overrides querier.worker-parallelism.")
	f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to frontend service to identify requests from the same querier. Defaults to hostname.")

	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f)
}

func (cfg *Config) Validate(log log.Logger) error {
	if cfg.FrontendAddress != "" && cfg.SchedulerAddress != "" {
		return errors.New("frontend address and scheduler address are mutually exclusive, please use only one")
	}
	return cfg.GRPCClientConfig.Validate(log)
}

// RequestHandler handles HTTP requests wrapped in protobuf messages.
type RequestHandler interface {
	Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error)
}

// A single processor handles all streaming operations to the query-frontend or
// query-scheduler, fetching queries and processing them.
type processor interface {
	// Each invocation of processQueriesOnSingleStream starts a new streaming
	// operation to the query-frontend or query-scheduler to fetch queries and
	// execute them.
	//
	// This method must react to the context being cancelled, and stop when that happens.
	//
	// processorManager (not processor) is responsible for starting as many
	// goroutines as needed for each connection.
	processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string)

	// notifyShutdown notifies the remote query-frontend or query-scheduler that
	// the querier is shutting down.
	notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string)
}
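// What follows is an illustrative sketch, not part of the upstream file: a
// minimal processor implementation that satisfies the contract above. The
// stream loop exits as soon as the context is cancelled; processorManager
// decides how many concurrent copies of it to run per connection.
type nopProcessor struct{}

func (nopProcessor) processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string) {
	for ctx.Err() == nil {
		// A real implementation opens a stream on conn, pulls the next query,
		// executes it via a RequestHandler, and reports the result back.
		time.Sleep(100 * time.Millisecond)
	}
}

func (nopProcessor) notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string) {}

// Compile-time check that the sketch satisfies the processor interface.
var _ processor = nopProcessor{}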
type querierWorker struct {
	*services.BasicService

	cfg    Config
	logger log.Logger

	processor processor

	subservices *services.Manager

	mu sync.Mutex
	// Set to nil when stop is called... no more managers are created afterwards.
	managers map[string]*processorManager

	metrics *Metrics
}

func NewQuerierWorker(cfg Config, rng ring.ReadRing, handler RequestHandler, logger log.Logger, reg prometheus.Registerer) (services.Service, error) {
	if cfg.QuerierID == "" {
		hostname, err := os.Hostname()
		if err != nil {
			return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID")
		}
		cfg.QuerierID = hostname
	}

	metrics := NewMetrics(cfg, reg)
	var processor processor
	var servs []services.Service
	var address string

	switch {
	case rng != nil:
		level.Info(logger).Log("msg", "Starting querier worker using query-scheduler and scheduler ring for addresses")
		processor, servs = newSchedulerProcessor(cfg, handler, logger, metrics)
	case cfg.SchedulerAddress != "":
		level.Info(logger).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress)

		address = cfg.SchedulerAddress
		processor, servs = newSchedulerProcessor(cfg, handler, logger, metrics)

	case cfg.FrontendAddress != "":
		level.Info(logger).Log("msg", "Starting querier worker connected to query-frontend", "frontend", cfg.FrontendAddress)

		address = cfg.FrontendAddress
		processor = newFrontendProcessor(cfg, handler, logger)
	default:
		return nil, errors.New("unable to start the querier worker, need to configure one of frontend_address, scheduler_address, or a ring config in the query_scheduler config block")
	}

	return newQuerierWorkerWithProcessor(cfg, metrics, logger, processor, address, rng, servs)
}

func newQuerierWorkerWithProcessor(cfg Config, metrics *Metrics, logger log.Logger, processor processor, address string, ring ring.ReadRing, servs []services.Service) (*querierWorker, error) {
	f := &querierWorker{
		cfg:       cfg,
		logger:    logger,
		managers:  map[string]*processorManager{},
		processor: processor,
		metrics:   metrics,
	}

	// Empty address is only used in tests, where individual targets are added manually.
	if address != "" {
		w, err := util.NewDNSWatcher(address, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}

		servs = append(servs, w)
	}

	if ring != nil {
		w, err := lokiutil.NewRingWatcher(log.With(logger, "component", "querier-scheduler-worker"), ring, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}
		servs = append(servs, w)
	}

	if len(servs) > 0 {
		subservices, err := services.NewManager(servs...)
		if err != nil {
			return nil, errors.Wrap(err, "querier worker subservices")
		}

		f.subservices = subservices
	}

	f.BasicService = services.NewIdleService(f.starting, f.stopping)
	return f, nil
}
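// Illustrative sketch, not part of the upstream file: typical wiring of a
// querier worker against a query-scheduler. The handler and the scheduler
// address are assumptions; flag defaults (e.g. DNSLookupPeriod) would normally
// be set via RegisterFlags before this point.
//
//	var handler RequestHandler // e.g. the querier's HTTP-over-gRPC handler
//	cfg := Config{
//		SchedulerAddress:      "query-scheduler:9095",
//		DNSLookupPeriod:       3 * time.Second,
//		MatchMaxConcurrency:   true,
//		MaxConcurrentRequests: 10,
//	}
//	w, err := NewQuerierWorker(cfg, nil, handler, log.NewNopLogger(), prometheus.NewRegistry())
//	if err != nil {
//		return err
//	}
//	return services.StartAndAwaitRunning(context.Background(), w)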
func (w *querierWorker) starting(ctx context.Context) error {
	if w.subservices == nil {
		return nil
	}
	return services.StartManagerAndAwaitHealthy(ctx, w.subservices)
}

func (w *querierWorker) stopping(_ error) error {
	// Stop all goroutines fetching queries. Note that in the Stopping state,
	// the worker no longer creates new managers in the AddressAdded method.
	w.mu.Lock()
	for _, m := range w.managers {
		m.stop()
	}
	w.mu.Unlock()

	if w.subservices == nil {
		return nil
	}

	// Stop the DNS watcher and the services used by the processor.
	return services.StopManagerAndAwaitStopped(context.Background(), w.subservices)
}

func (w *querierWorker) AddressAdded(address string) {
	ctx := w.ServiceContext()
	if ctx == nil || ctx.Err() != nil {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()

	if m := w.managers[address]; m != nil {
		return
	}

	level.Info(w.logger).Log("msg", "adding connection", "addr", address)
	conn, err := w.connect(context.Background(), address)
	if err != nil {
		level.Error(w.logger).Log("msg", "error connecting", "addr", address, "err", err)
		return
	}

	w.managers[address] = newProcessorManager(ctx, w.processor, conn, address)
	// Called with lock held.
	w.resetConcurrency()
}

func (w *querierWorker) AddressRemoved(address string) {
	level.Info(w.logger).Log("msg", "removing connection", "addr", address)

	w.mu.Lock()
	p := w.managers[address]
	delete(w.managers, address)
	// Called with lock held.
	w.resetConcurrency()
	w.mu.Unlock()

	if p != nil {
		p.stop()
	}
}

// Must be called with the lock held.
func (w *querierWorker) resetConcurrency() {
	totalConcurrency := 0
	defer func() {
		w.metrics.concurrentWorkers.Set(float64(totalConcurrency))
	}()
	index := 0

	for _, m := range w.managers {
		concurrency := 0

		if w.cfg.MatchMaxConcurrency {
			concurrency = w.cfg.MaxConcurrentRequests / len(w.managers)

			// If max concurrency does not divide evenly across the frontends, a
			// subset is chosen to receive an extra connection. Go randomizes map
			// iteration order, so this amounts to a random selection of frontends.
			if index < w.cfg.MaxConcurrentRequests%len(w.managers) {
				level.Warn(w.logger).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", m.address)
				concurrency++
			}
		} else {
			concurrency = w.cfg.Parallelism
		}

		// If concurrency is 0, then MaxConcurrentRequests is less than the total
		// number of frontends/schedulers. To prevent accidentally starving a
		// frontend or scheduler, always connect once to every target. This is
		// dangerous because we may start exceeding the LogQL max concurrency.
		if concurrency == 0 {
			concurrency = 1
		}

		totalConcurrency += concurrency
		m.concurrency(concurrency)
		index++
	}

	if totalConcurrency > w.cfg.MaxConcurrentRequests {
		level.Warn(w.logger).Log("msg", "total worker concurrency is greater than logql max concurrency. Queries may be queued in the querier which reduces QOS")
	}
}
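// Worked example for resetConcurrency above (illustrative numbers): with
// MatchMaxConcurrency enabled, MaxConcurrentRequests = 10 and 3 connected
// targets, every manager gets 10/3 = 3 goroutines and 10%3 = 1 of them gets an
// extra one, for a total of 4+3+3 = 10. With 12 targets, 10/12 = 0 would starve
// most targets, so the floor of 1 applies, 12 goroutines run in total, and the
// warning above about exceeding the LogQL max concurrency is logged.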
func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) {
	// Because we only use a single long-running method, it doesn't make sense to
	// inject a user ID, send traces, or add metrics.
	opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil)
	if err != nil {
		return nil, err
	}

	conn, err := grpc.DialContext(ctx, address, opts...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}
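// Illustrative sketch, not part of the upstream file: driving the worker's
// service-discovery callbacks by hand, as tests can do by passing an empty
// address (so no DNS watcher is created). proc, cfg and the target address
// below are assumptions.
//
//	w, _ := newQuerierWorkerWithProcessor(cfg, NewMetrics(cfg, nil), log.NewNopLogger(), proc, "", nil, nil)
//	_ = services.StartAndAwaitRunning(context.Background(), w)
//	w.AddressAdded("127.0.0.1:9095")   // dials the target and spawns a processorManager
//	w.AddressRemoved("127.0.0.1:9095") // stops that manager and rebalances concurrency
//	_ = services.StopAndAwaitTerminated(context.Background(), w)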