github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/querier/worker/worker.go

package worker

import (
	"context"
	"flag"
	"os"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/grpcclient"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/weaveworks/common/httpgrpc"
	"google.golang.org/grpc"

	"github.com/cortexproject/cortex/pkg/util"
)

type Config struct {
	FrontendAddress  string        `yaml:"frontend_address"`
	SchedulerAddress string        `yaml:"scheduler_address"`
	DNSLookupPeriod  time.Duration `yaml:"dns_lookup_duration"`

	Parallelism           int  `yaml:"parallelism"`
	MatchMaxConcurrency   bool `yaml:"match_max_concurrent"`
	MaxConcurrentRequests int  `yaml:"-"` // Must be same as passed to PromQL Engine.

	QuerierID string `yaml:"id"`

	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config"`
}

func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.SchedulerAddress, "querier.scheduler-address", "", "Hostname (and port) of scheduler that querier will periodically resolve, connect to and receive queries from. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")
	f.StringVar(&cfg.FrontendAddress, "querier.frontend-address", "", "Address of query frontend service, in host:port format. If -querier.scheduler-address is set as well, querier will use scheduler instead. Only one of -querier.frontend-address or -querier.scheduler-address can be set. If neither is set, queries are only received via HTTP endpoint.")

	f.DurationVar(&cfg.DNSLookupPeriod, "querier.dns-lookup-period", 10*time.Second, "How often to query DNS for query-frontend or query-scheduler address.")

	f.IntVar(&cfg.Parallelism, "querier.worker-parallelism", 10, "Number of simultaneous queries to process per query-frontend or query-scheduler.")
	f.BoolVar(&cfg.MatchMaxConcurrency, "querier.worker-match-max-concurrent", false, "Force worker concurrency to match the -querier.max-concurrent option. Overrides querier.worker-parallelism.")
	f.StringVar(&cfg.QuerierID, "querier.id", "", "Querier ID, sent to frontend service to identify requests from the same querier. Defaults to hostname.")

	cfg.GRPCClientConfig.RegisterFlagsWithPrefix("querier.frontend-client", f)
}

func (cfg *Config) Validate(log log.Logger) error {
	if cfg.FrontendAddress != "" && cfg.SchedulerAddress != "" {
		return errors.New("frontend address and scheduler address are mutually exclusive, please use only one")
	}
	return cfg.GRPCClientConfig.Validate(log)
}

// RequestHandler handles HTTP requests wrapped in protobuf messages.
type RequestHandler interface {
	Handle(context.Context, *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error)
}

// A single processor handles all streaming operations to the query-frontend or
// query-scheduler, fetching queries and processing them.
type processor interface {
	// Each invocation of processQueriesOnSingleStream starts a new streaming operation to the
	// query-frontend or query-scheduler to fetch queries and execute them.
	//
	// This method must react to the context being cancelled, and stop when that happens.
	//
	// processorManager (not processor) is responsible for starting as many goroutines as needed for each connection.
	processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string)

	// notifyShutdown notifies the remote query-frontend or query-scheduler that the querier is
	// shutting down.
	notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string)
}

type querierWorker struct {
	*services.BasicService

	cfg Config
	log log.Logger

	processor processor

	subservices *services.Manager

	mu sync.Mutex
	// Set to nil when stop is called... no more managers are created afterwards.
	managers map[string]*processorManager
}

func NewQuerierWorker(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (services.Service, error) {
	if cfg.QuerierID == "" {
		hostname, err := os.Hostname()
		if err != nil {
			return nil, errors.Wrap(err, "failed to get hostname for configuring querier ID")
		}
		cfg.QuerierID = hostname
	}

	var processor processor
	var servs []services.Service
	var address string

	switch {
	case cfg.SchedulerAddress != "":
		level.Info(log).Log("msg", "Starting querier worker connected to query-scheduler", "scheduler", cfg.SchedulerAddress)

		address = cfg.SchedulerAddress
		processor, servs = newSchedulerProcessor(cfg, handler, log, reg)

	case cfg.FrontendAddress != "":
		level.Info(log).Log("msg", "Starting querier worker connected to query-frontend", "frontend", cfg.FrontendAddress)

		address = cfg.FrontendAddress
		processor = newFrontendProcessor(cfg, handler, log)

	default:
		return nil, errors.New("no query-scheduler or query-frontend address")
	}

	return newQuerierWorkerWithProcessor(cfg, log, processor, address, servs)
}

func newQuerierWorkerWithProcessor(cfg Config, log log.Logger, processor processor, address string, servs []services.Service) (*querierWorker, error) {
	f := &querierWorker{
		cfg:       cfg,
		log:       log,
		managers:  map[string]*processorManager{},
		processor: processor,
	}

	// Empty address is only used in tests, where individual targets are added manually.
	if address != "" {
		w, err := util.NewDNSWatcher(address, cfg.DNSLookupPeriod, f)
		if err != nil {
			return nil, err
		}

		servs = append(servs, w)
	}

	if len(servs) > 0 {
		subservices, err := services.NewManager(servs...)
		if err != nil {
			return nil, errors.Wrap(err, "querier worker subservices")
		}

		f.subservices = subservices
	}

	f.BasicService = services.NewIdleService(f.starting, f.stopping)
	return f, nil
}

func (w *querierWorker) starting(ctx context.Context) error {
	if w.subservices == nil {
		return nil
	}
	return services.StartManagerAndAwaitHealthy(ctx, w.subservices)
}

func (w *querierWorker) stopping(_ error) error {
	// Stop all goroutines fetching queries. Note that in Stopping state,
	// worker no longer creates new managers in AddressAdded method.
	w.mu.Lock()
	for _, m := range w.managers {
		m.stop()
	}
	w.mu.Unlock()

	if w.subservices == nil {
		return nil
	}

	// Stop DNS watcher and services used by processor.
	return services.StopManagerAndAwaitStopped(context.Background(), w.subservices)
}

func (w *querierWorker) AddressAdded(address string) {
	ctx := w.ServiceContext()
	if ctx == nil || ctx.Err() != nil {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()

	if m := w.managers[address]; m != nil {
		return
	}

	level.Info(w.log).Log("msg", "adding connection", "addr", address)
	conn, err := w.connect(context.Background(), address)
	if err != nil {
		level.Error(w.log).Log("msg", "error connecting", "addr", address, "err", err)
		return
	}

	w.managers[address] = newProcessorManager(ctx, w.processor, conn, address)
	// Called with lock.
	w.resetConcurrency()
}

func (w *querierWorker) AddressRemoved(address string) {
	level.Info(w.log).Log("msg", "removing connection", "addr", address)

	w.mu.Lock()
	p := w.managers[address]
	delete(w.managers, address)
	// Called with lock.
	w.resetConcurrency()
	w.mu.Unlock()

	if p != nil {
		p.stop()
	}
}

// Must be called with lock.
func (w *querierWorker) resetConcurrency() {
	totalConcurrency := 0
	index := 0

	for _, m := range w.managers {
		concurrency := 0

		if w.cfg.MatchMaxConcurrency {
			concurrency = w.cfg.MaxConcurrentRequests / len(w.managers)

			// If max concurrency does not divide evenly across our frontends, a subset is chosen
			// to receive an extra connection. Since Go map iteration order is randomized, this amounts
			// to a random selection of frontends.
			if index < w.cfg.MaxConcurrentRequests%len(w.managers) {
				level.Warn(w.log).Log("msg", "max concurrency is not evenly divisible across targets, adding an extra connection", "addr", m.address)
				concurrency++
			}
		} else {
			concurrency = w.cfg.Parallelism
		}

		// If concurrency is 0, then MaxConcurrentRequests is less than the total number of
		// frontends/schedulers. To avoid accidentally starving a frontend or scheduler, we
		// always connect at least once to every target. This is risky because we may start
		// exceeding the PromQL max concurrency.
		if concurrency == 0 {
			concurrency = 1
		}

		totalConcurrency += concurrency
		m.concurrency(concurrency)
		index++
	}

	if totalConcurrency > w.cfg.MaxConcurrentRequests {
		level.Warn(w.log).Log("msg", "total worker concurrency is greater than promql max concurrency. Queries may be queued in the querier which reduces QOS")
	}
}

func (w *querierWorker) connect(ctx context.Context, address string) (*grpc.ClientConn, error) {
	// Because we only use a single long-running method, it doesn't make sense to inject a user ID, send tracing, or add metrics.
	opts, err := w.cfg.GRPCClientConfig.DialOption(nil, nil)
	if err != nil {
		return nil, err
	}

	conn, err := grpc.DialContext(ctx, address, opts...)
	if err != nil {
		return nil, err
	}
	return conn, nil
}
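// Worked example (illustrative only, not part of the original file) of the split
// performed by resetConcurrency above when MatchMaxConcurrency is enabled:
// with MaxConcurrentRequests = 16 and 3 connected schedulers, each manager gets
// 16/3 = 5 concurrent workers and 16%3 = 1 manager receives one extra, giving
// 6+5+5 = 16 workers in total. With MatchMaxConcurrency disabled, every manager
// simply runs cfg.Parallelism workers regardless of the number of targets.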
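// The sketch below is illustrative and not part of the original file: it shows
// one way NewQuerierWorker could be wired up against a query-scheduler, assuming
// a trivial handler. All "example"-prefixed names and the addresses used here
// are hypothetical.
type exampleHandler struct{}

func (exampleHandler) Handle(_ context.Context, _ *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
	// Return an empty 200 response for every query.
	return &httpgrpc.HTTPResponse{Code: 200}, nil
}

func exampleStartWorker(ctx context.Context) error {
	cfg := Config{
		SchedulerAddress: "query-scheduler.example.svc:9095", // hypothetical address
		DNSLookupPeriod:  10 * time.Second,
		Parallelism:      4,
		QuerierID:        "querier-1",
	}

	w, err := NewQuerierWorker(cfg, exampleHandler{}, log.NewNopLogger(), prometheus.NewRegistry())
	if err != nil {
		return err
	}

	// Start the worker service and wait until it reports Running; connections to
	// resolved scheduler addresses are then added via AddressAdded.
	return services.StartAndAwaitRunning(ctx, w)
}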