github.com/grafana/pyroscope@v1.18.0/pkg/querier/worker/worker.go (about) 1 // SPDX-License-Identifier: AGPL-3.0-only 2 // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/querier/worker/worker.go 3 // Provenance-includes-license: Apache-2.0 4 // Provenance-includes-copyright: The Cortex Authors. 5 6 package worker 7 8 import ( 9 "context" 10 "flag" 11 "fmt" 12 "os" 13 "sync" 14 "time" 15 16 "github.com/go-kit/log" 17 "github.com/go-kit/log/level" 18 "github.com/grafana/dskit/grpcclient" 19 "github.com/grafana/dskit/services" 20 "github.com/pkg/errors" 21 "github.com/prometheus/client_golang/prometheus" 22 "google.golang.org/grpc" 23 24 "github.com/grafana/pyroscope/pkg/scheduler/schedulerdiscovery" 25 "github.com/grafana/pyroscope/pkg/util/httpgrpc" 26 "github.com/grafana/pyroscope/pkg/util/servicediscovery" 27 ) 28 29 type Config struct { 30 SchedulerAddress string `yaml:"scheduler_address" doc:"hidden"` 31 DNSLookupPeriod time.Duration `yaml:"dns_lookup_duration" category:"advanced" doc:"hidden"` 32 QuerierID string `yaml:"id" category:"advanced"` 33 GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the queriers and the query-frontends / query-schedulers."` 34 35 MaxConcurrent int `yaml:"max_concurrent" category:"advanced"` 36 37 // This configuration is injected internally. 38 QuerySchedulerDiscovery schedulerdiscovery.Config `yaml:"-"` 39 MaxLoopDuration time.Duration `yaml:"-"` 40 } 41 42 func (cfg *Config) RegisterFlags(f *flag.FlagSet) { 43 f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to the query-frontend to identify requests from the same querier. Defaults to hostname.") 44 f.IntVar(&cfg.MaxConcurrent, "querier.max-concurrent", 4, "The maximum number of concurrent queries allowed.") 45 46 cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f) 47 } 48 49 func (cfg *Config) Validate(log log.Logger) error { 50 if cfg.QuerySchedulerDiscovery.Mode == schedulerdiscovery.ModeRing && cfg.SchedulerAddress != "" { 51 return fmt.Errorf("scheduler address cannot be specified when query-scheduler service discovery mode is set to '%s'", cfg.QuerySchedulerDiscovery.Mode) 52 } 53 54 return cfg.GRPCClientConfig.Validate() 55 } 56 57 // RequestHandler for HTTP requests wrapped in protobuf messages. 58 type RequestHandler interface { 59 Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) 60 } 61 62 // Single processor handles all streaming operations to query-frontend or query-scheduler to fetch queries 63 // and process them. 64 type processor interface { 65 // Each invocation of processQueriesOnSingleStream starts new streaming operation to query-frontend 66 // or query-scheduler to fetch queries and execute them. 67 // 68 // This method must react on context being finished, and stop when that happens. 69 // 70 // processorManager (not processor) is responsible for starting as many goroutines as needed for each connection. 71 processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string) 72 73 // notifyShutdown notifies the remote query-frontend or query-scheduler that the querier is 74 // shutting down. 75 notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string) 76 } 77 78 // serviceDiscoveryFactory makes a new service discovery instance. 79 type serviceDiscoveryFactory func(receiver servicediscovery.Notifications) (services.Service, error) 80 81 type querierWorker struct { 82 *services.BasicService 83 84 cfg Config 85 log log.Logger 86 87 processor processor 88 89 // Subservices manager. 90 subservices *services.Manager 91 subservicesWatcher *services.FailureWatcher 92 93 mu sync.Mutex 94 managers map[string]*processorManager 95 instances map[string]servicediscovery.Instance 96 } 97 98 func NewQuerierWorker(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (services.Service, error) { 99 if cfg.QuerierID == "" { 100 hostname, err := os.Hostname() 101 if err != nil { 102 return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID") 103 } 104 cfg.QuerierID = hostname 105 } 106 107 var processor processor 108 var servs []services.Service 109 var factory serviceDiscoveryFactory 110 111 switch { 112 case cfg.SchedulerAddress != "" || cfg.QuerySchedulerDiscovery.Mode == schedulerdiscovery.ModeRing: 113 level.Info(log).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress) 114 115 factory = func(receiver servicediscovery.Notifications) (services.Service, error) { 116 return schedulerdiscovery.New(cfg.QuerySchedulerDiscovery, cfg.SchedulerAddress, cfg.DNSLookupPeriod, "querier", receiver, log, reg) 117 } 118 119 processor, servs = newSchedulerProcessor(cfg, handler, log, reg) 120 121 default: 122 return nil, errors.New("no query-scheduler or query-frontend address") 123 } 124 125 return newQuerierWorkerWithProcessor(cfg, log, processor, factory, servs) 126 } 127 128 func newQuerierWorkerWithProcessor(cfg Config, log log.Logger, processor processor, newServiceDiscovery serviceDiscoveryFactory, servs []services.Service) (*querierWorker, error) { 129 f := &querierWorker{ 130 cfg: cfg, 131 log: log, 132 managers: map[string]*processorManager{}, 133 instances: map[string]servicediscovery.Instance{}, 134 processor: processor, 135 } 136 137 // There's no service discovery in some tests. 138 if newServiceDiscovery != nil { 139 w, err := newServiceDiscovery(f) 140 if err != nil { 141 return nil, err 142 } 143 144 servs = append(servs, w) 145 } 146 147 if len(servs) > 0 { 148 subservices, err := services.NewManager(servs...) 149 if err != nil { 150 return nil, errors.Wrap(err, "querier worker subservices") 151 } 152 153 f.subservices = subservices 154 f.subservicesWatcher = services.NewFailureWatcher() 155 } 156 157 f.BasicService = services.NewBasicService(f.starting, f.running, f.stopping) 158 return f, nil 159 } 160 161 func (w *querierWorker) starting(ctx context.Context) error { 162 if w.subservices == nil { 163 return nil 164 } 165 166 w.subservicesWatcher.WatchManager(w.subservices) 167 return services.StartManagerAndAwaitHealthy(ctx, w.subservices) 168 } 169 170 func (w *querierWorker) running(ctx context.Context) error { 171 select { 172 case <-ctx.Done(): 173 return nil 174 case err := <-w.subservicesWatcher.Chan(): // The channel will be nil if w.subservicesWatcher is not set. 175 return errors.Wrap(err, "querier worker subservice failed") 176 } 177 } 178 179 func (w *querierWorker) stopping(_ error) error { 180 // Stop all goroutines fetching queries. Note that in Stopping state, 181 // worker no longer creates new managers in InstanceAdded method. 182 w.mu.Lock() 183 for address, m := range w.managers { 184 m.stop() 185 186 delete(w.managers, address) 187 delete(w.instances, address) 188 } 189 w.mu.Unlock() 190 191 if w.subservices == nil { 192 return nil 193 } 194 195 // Stop service discovery and services used by processor. 196 return services.StopManagerAndAwaitStopped(context.Background(), w.subservices) 197 } 198 199 func (w *querierWorker) InstanceAdded(instance servicediscovery.Instance) { 200 w.mu.Lock() 201 defer w.mu.Unlock() 202 203 // Ensure the querier worker hasn't been stopped (or is stopping). 204 // This check is done inside the lock, to avoid any race condition with the stopping() function. 205 ctx := w.ServiceContext() 206 if ctx == nil || ctx.Err() != nil { 207 return 208 } 209 210 address := instance.Address 211 if m := w.managers[address]; m != nil { 212 return 213 } 214 215 level.Info(w.log).Log("msg", "adding connection", "addr", address, "in-use", instance.InUse) 216 conn, err := w.connect(context.Background(), address) 217 if err != nil { 218 level.Error(w.log).Log("msg", "error connecting", "addr", address, "err", err) 219 return 220 } 221 222 w.managers[address] = newProcessorManager(ctx, w.processor, conn, address) 223 w.instances[address] = instance 224 225 // Called with lock. 226 w.resetConcurrency() 227 } 228 229 func (w *querierWorker) InstanceRemoved(instance servicediscovery.Instance) { 230 address := instance.Address 231 232 level.Info(w.log).Log("msg", "removing connection", "addr", address, "in-use", instance.InUse) 233 234 w.mu.Lock() 235 p := w.managers[address] 236 delete(w.managers, address) 237 delete(w.instances, address) 238 w.mu.Unlock() 239 240 if p != nil { 241 p.stop() 242 } 243 244 // Re-balance the connections between the available query-frontends / query-schedulers. 245 w.mu.Lock() 246 w.resetConcurrency() 247 w.mu.Unlock() 248 } 249 250 func (w *querierWorker) InstanceChanged(instance servicediscovery.Instance) { 251 w.mu.Lock() 252 defer w.mu.Unlock() 253 254 // Ensure the querier worker hasn't been stopped (or is stopping). 255 // This check is done inside the lock, to avoid any race condition with the stopping() function. 256 ctx := w.ServiceContext() 257 if ctx == nil || ctx.Err() != nil { 258 return 259 } 260 261 // Ensure there's a manager for the instance. If there's no, then it's a bug. 262 if m := w.managers[instance.Address]; m == nil { 263 level.Error(w.log).Log("msg", "received a notification about an unknown backend instance", "addr", instance.Address, "in-use", instance.InUse) 264 return 265 } 266 267 level.Info(w.log).Log("msg", "updating connection", "addr", instance.Address, "in-use", instance.InUse) 268 269 // Update instance and adjust concurrency. 270 w.instances[instance.Address] = instance 271 272 // Called with lock. 273 w.resetConcurrency() 274 } 275 276 // Must be called with lock. 277 func (w *querierWorker) resetConcurrency() { 278 desiredConcurrency := w.getDesiredConcurrency() 279 280 for _, m := range w.managers { 281 concurrency, ok := desiredConcurrency[m.address] 282 if !ok { 283 // This error should never happen. If it does, it means there's a bug in the code. 284 level.Error(w.log).Log("msg", "a querier worker is connected to an unknown remote endpoint", "addr", m.address) 285 286 // Consider it as not in-use. 287 concurrency = 1 288 } 289 290 m.concurrency(concurrency) 291 } 292 } 293 294 // getDesiredConcurrency returns the number of desired connections for each discovered query-frontend / query-scheduler instance. 295 // Must be called with lock. 296 func (w *querierWorker) getDesiredConcurrency() map[string]int { 297 // Count the number of in-use instances. 298 numInUse := 0 299 for _, instance := range w.instances { 300 if instance.InUse { 301 numInUse++ 302 } 303 } 304 305 var ( 306 desired = make(map[string]int, len(w.instances)) 307 inUseIndex = 0 308 ) 309 310 // Compute the number of desired connections for each discovered instance. 311 for address, instance := range w.instances { 312 // Run only 1 worker for each instance not in-use, to allow for the queues 313 // to be drained when the in-use instances change or if, for any reason, 314 // queries are enqueued on the ones not in-use. 315 if !instance.InUse { 316 desired[address] = 1 317 continue 318 } 319 320 concurrency := w.cfg.MaxConcurrent / numInUse 321 322 // If max concurrency does not evenly divide into in-use instances, then a subset will be chosen 323 // to receive an extra connection. Since we're iterating a map (whose iteration order is not guaranteed), 324 // then this should practically select a random address for the extra connection. 325 if inUseIndex < w.cfg.MaxConcurrent%numInUse { 326 level.Warn(w.log).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", address) 327 concurrency++ 328 } 329 330 // If concurrency is 0 then MaxConcurrentRequests is less than the total number of 331 // frontends/schedulers. In order to prevent accidentally starving a frontend or scheduler we are just going to 332 // always connect once to every target. 333 if concurrency == 0 { 334 concurrency = 1 335 } 336 337 desired[address] = concurrency 338 inUseIndex++ 339 } 340 341 return desired 342 } 343 344 func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) { 345 // Because we only use single long-running method, it doesn't make sense to inject user ID, send over tracing or add metrics. 346 opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil, nil) 347 if err != nil { 348 return nil, err 349 } 350 351 conn, err := grpc.DialContext(ctx, address, opts...) 352 if err != nil { 353 return nil, err 354 } 355 return conn, nil 356 }