github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/scrape/discovery/manager.go

// Copyright 2013 The Prometheus Authors
// Copyright 2021 The Pyroscope Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package discovery

import (
	"context"
	"fmt"
	"reflect"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"

	"github.com/pyroscope-io/pyroscope/pkg/scrape/discovery/targetgroup"
)

var (
	failedConfigs = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pyroscope_sd_failed_configs",
			Help: "Current number of service discovery configurations that failed to load.",
		},
		[]string{"name"},
	)
	discoveredTargets = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pyroscope_sd_discovered_targets",
			Help: "Current number of discovered targets.",
		},
		[]string{"name", "config"},
	)
	receivedUpdates = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "pyroscope_sd_received_updates_total",
			Help: "Total number of update events received from the SD providers.",
		},
		[]string{"name"},
	)
	delayedUpdates = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "pyroscope_sd_updates_delayed_total",
			Help: "Total number of update events that couldn't be sent immediately.",
		},
		[]string{"name"},
	)
	sentUpdates = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "pyroscope_sd_updates_total",
			Help: "Total number of update events sent to the SD consumers.",
		},
		[]string{"name"},
	)
)

func RegisterMetrics() {
	prometheus.MustRegister(failedConfigs, discoveredTargets, receivedUpdates, delayedUpdates, sentUpdates)
}

type poolKey struct {
	setName  string
	provider string
}

// provider holds a Discoverer instance, its configuration, cancel func and its subscribers.
type provider struct {
	name   string
	d      Discoverer
	config interface{}

	cancel context.CancelFunc
	// done should be called after cleaning up resources associated with the cancelled provider.
	done func()

	mu   sync.RWMutex
	subs map[string]struct{}

	// newSubs is used to temporarily store subs to be used upon config reload completion.
	newSubs map[string]struct{}
}

// IsStarted returns true if the Discoverer is started.
func (p *provider) IsStarted() bool {
	return p.cancel != nil
}

// NewManager is the Discovery Manager constructor.
func NewManager(logger logrus.FieldLogger, options ...func(*Manager)) *Manager {
	mgr := &Manager{
		logger:      logger,
		syncCh:      make(chan map[string][]*targetgroup.Group),
		targets:     make(map[poolKey]map[string]*targetgroup.Group),
		updatert:    5 * time.Second,
		triggerSend: make(chan struct{}, 1),
	}
	mgr.ctx, mgr.cancel = context.WithCancel(context.Background())
	for _, option := range options {
		option(mgr)
	}
	return mgr
}

// Name sets the name of the manager.
func Name(n string) func(*Manager) {
	return func(m *Manager) {
		m.mtx.Lock()
		defer m.mtx.Unlock()
		m.name = n
	}
}

// Manager maintains a set of discovery providers and sends each update to a map channel.
// Targets are grouped by the target set name.
type Manager struct {
	logger logrus.FieldLogger
	name   string
	mtx    sync.RWMutex
	ctx    context.Context
	cancel context.CancelFunc

	// Some Discoverers (e.g. k8s) send only the updates for a given target group,
	// so we use map[tg.Source]*targetgroup.Group to know which group to update.
	targets    map[poolKey]map[string]*targetgroup.Group
	targetsMtx sync.Mutex

	// providers keeps track of SD providers.
	providers []*provider
	// The sync channel sends the updates as a map where the key is the job value from the scrape config.
	syncCh chan map[string][]*targetgroup.Group

	// How long to wait before sending updates to the channel. The variable
	// should only be modified in unit tests.
	updatert time.Duration

	// The triggerSend channel signals to the Manager that new updates have been received from providers.
	triggerSend chan struct{}

	// lastProvider counts providers registered during the Manager's lifetime.
	lastProvider uint
}

// Run starts the background processing.
func (m *Manager) Run() error {
	go m.sender()
	<-m.ctx.Done()
	m.cancelDiscoverers()
	return m.ctx.Err()
}

func (m *Manager) Stop() {
	if m.cancel != nil {
		m.cancel()
	}
}

// SyncCh returns a read only channel used by all the clients to receive target updates.
func (m *Manager) SyncCh() <-chan map[string][]*targetgroup.Group {
	return m.syncCh
}
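// The sketch below is illustrative and not part of the original file: it shows
// how a consumer might wire a Manager together end to end using the exported
// API above (NewManager, Name, Run, ApplyConfig, SyncCh). The function name
// runDiscoveryExample, the "app" target set name, and the cfgs parameter are
// hypothetical; error handling is elided.
func runDiscoveryExample(logger logrus.FieldLogger, cfgs Configs) {
	mgr := NewManager(logger, Name("scrape"))

	// Run blocks until Stop cancels the manager's context, so start it in the
	// background.
	go func() {
		if err := mgr.Run(); err != nil {
			logger.WithError(err).Debug("discovery manager stopped")
		}
	}()

	// Register the SD configurations for a single target set ("app").
	if err := mgr.ApplyConfig(map[string]Configs{"app": cfgs}); err != nil {
		logger.WithError(err).Error("applying SD config")
		return
	}

	// Consume grouped target updates; the map keys are the target set names
	// that were passed to ApplyConfig.
	for tsets := range mgr.SyncCh() {
		for setName, groups := range tsets {
			logger.Debugf("%s: %d target groups", setName, len(groups))
		}
	}
}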
// ApplyConfig checks whether a discovery provider with the supplied config is already
// running and, if so, keeps it as is. Remaining providers are then stopped and new
// required providers are started using the provided config.
func (m *Manager) ApplyConfig(cfg map[string]Configs) error {
	m.mtx.Lock()
	defer m.mtx.Unlock()

	var failedCount int
	for name, scfg := range cfg {
		failedCount += m.registerProviders(scfg, name)
	}
	failedConfigs.WithLabelValues(m.name).Set(float64(failedCount))

	var (
		wg sync.WaitGroup
		// keep shows if we keep any providers after reload.
		keep         bool
		newProviders []*provider
	)
	for _, prov := range m.providers {
		// Cancel obsolete providers.
		if len(prov.newSubs) == 0 {
			wg.Add(1)
			prov.done = func() {
				wg.Done()
			}
			prov.cancel()
			continue
		}
		newProviders = append(newProviders, prov)
		// refTargets keeps reference targets used to populate new subs' targets.
		var refTargets map[string]*targetgroup.Group
		prov.mu.Lock()
		for s := range prov.subs {
			keep = true
			refTargets = m.targets[poolKey{s, prov.name}]
			// Remove obsolete subs' targets.
			if _, ok := prov.newSubs[s]; !ok {
				m.targetsMtx.Lock()
				delete(m.targets, poolKey{s, prov.name})
				m.targetsMtx.Unlock()
				discoveredTargets.DeleteLabelValues(m.name, s)
			}
		}
		// Set metrics and targets for new subs.
		for s := range prov.newSubs {
			if _, ok := prov.subs[s]; !ok {
				discoveredTargets.WithLabelValues(m.name, s).Set(0)
			}
			if l := len(refTargets); l > 0 {
				m.targets[poolKey{s, prov.name}] = make(map[string]*targetgroup.Group, l)
				for k, v := range refTargets {
					m.targets[poolKey{s, prov.name}][k] = v
				}
			}
		}
		prov.subs = prov.newSubs
		prov.newSubs = map[string]struct{}{}
		prov.mu.Unlock()
		if !prov.IsStarted() {
			m.startProvider(m.ctx, prov)
		}
	}
	// Currently downstream managers expect full target state upon config reload, so we must oblige.
	// While startProvider does pull the trigger, it may take some time to do so, therefore
	// we pull the trigger as soon as possible so that downstream managers can populate their state.
	// See https://github.com/prometheus/prometheus/pull/8639 for details.
	if keep {
		select {
		case m.triggerSend <- struct{}{}:
		default:
		}
	}
	m.providers = newProviders
	wg.Wait()

	return nil
}
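// Illustrative sketch, not part of the original file: because registerProviders
// de-duplicates SD configurations with reflect.DeepEqual, two target sets that
// share an identical configuration subscribe to a single provider instance.
// The applyShared name and the "job-a"/"job-b" set names are hypothetical.
func applyShared(mgr *Manager, shared Config) error {
	// Both sets end up as subs of the same provider, and a later reload that
	// drops one of them only removes that subscription, not the provider.
	return mgr.ApplyConfig(map[string]Configs{
		"job-a": {shared},
		"job-b": {shared},
	})
}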
// StartCustomProvider is used for sdtool. Only use this if you know what you're doing.
func (m *Manager) StartCustomProvider(ctx context.Context, name string, worker Discoverer) {
	p := &provider{
		name: name,
		d:    worker,
		subs: map[string]struct{}{
			name: {},
		},
	}
	m.providers = append(m.providers, p)
	m.startProvider(ctx, p)
}

func (m *Manager) startProvider(ctx context.Context, p *provider) {
	m.logger.
		WithField("provider", p.name).
		WithField("subs", fmt.Sprintf("%v", p.subs)).
		Debug("starting provider")
	ctx, cancel := context.WithCancel(ctx)
	updates := make(chan []*targetgroup.Group)

	p.cancel = cancel

	go p.d.Run(ctx, updates)
	go m.updater(ctx, p, updates)
}

// cleaner cleans resources associated with a provider.
func (m *Manager) cleaner(p *provider) {
	m.targetsMtx.Lock()
	p.mu.RLock()
	for s := range p.subs {
		delete(m.targets, poolKey{s, p.name})
	}
	p.mu.RUnlock()
	m.targetsMtx.Unlock()
	if p.done != nil {
		p.done()
	}
}

func (m *Manager) updater(ctx context.Context, p *provider, updates chan []*targetgroup.Group) {
	// Ensure targets from this provider are cleaned up.
	defer m.cleaner(p)
	for {
		select {
		case <-ctx.Done():
			return
		case tgs, ok := <-updates:
			receivedUpdates.WithLabelValues(m.name).Inc()
			if !ok {
				m.logger.WithField("provider", p.name).Debug("discoverer channel closed")
				// Wait for provider cancellation to ensure targets are cleaned up when expected.
				<-ctx.Done()
				return
			}

			p.mu.RLock()
			for s := range p.subs {
				m.updateGroup(poolKey{setName: s, provider: p.name}, tgs)
			}
			p.mu.RUnlock()

			select {
			case m.triggerSend <- struct{}{}:
			default:
			}
		}
	}
}

func (m *Manager) sender() {
	ticker := time.NewTicker(m.updatert)
	defer ticker.Stop()

	for {
		select {
		case <-m.ctx.Done():
			return
		case <-ticker.C: // Some discoverers send updates too often, so we throttle these with the ticker.
			select {
			case <-m.triggerSend:
				sentUpdates.WithLabelValues(m.name).Inc()
				select {
				case m.syncCh <- m.allGroups():
				default:
					delayedUpdates.WithLabelValues(m.name).Inc()
					m.logger.Debugf("discovery receiver's channel was full so will retry the next cycle")
					select {
					case m.triggerSend <- struct{}{}:
					default:
					}
				}
			default:
			}
		}
	}
}
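// Illustrative sketch, not part of the original file: updater and sender
// coalesce bursts of provider updates through triggerSend, a channel with a
// buffer of one written with a non-blocking send. notifyExample is a
// hypothetical name that shows the pattern in isolation.
func notifyExample(trigger chan struct{}) {
	// If a signal is already pending, dropping this one is safe: the pending
	// signal makes the receiver pick up the latest state on its next cycle.
	select {
	case trigger <- struct{}{}:
	default:
	}
}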
func (m *Manager) cancelDiscoverers() {
	m.mtx.RLock()
	defer m.mtx.RUnlock()
	for _, p := range m.providers {
		if p.cancel != nil {
			p.cancel()
		}
	}
}

func (m *Manager) updateGroup(poolKey poolKey, tgs []*targetgroup.Group) {
	m.targetsMtx.Lock()
	defer m.targetsMtx.Unlock()

	if _, ok := m.targets[poolKey]; !ok {
		m.targets[poolKey] = make(map[string]*targetgroup.Group)
	}
	for _, tg := range tgs {
		if tg != nil { // Some Discoverers send nil target group so need to check for it to avoid panics.
			m.targets[poolKey][tg.Source] = tg
		}
	}
}

func (m *Manager) allGroups() map[string][]*targetgroup.Group {
	tSets := map[string][]*targetgroup.Group{}
	n := map[string]int{}

	m.targetsMtx.Lock()
	defer m.targetsMtx.Unlock()
	for pkey, tsets := range m.targets {
		for _, tg := range tsets {
			// Even if the target group 'tg' is empty we still need to send it to the 'Scrape manager'
			// to signal that it needs to stop all scrape loops for this target set.
			tSets[pkey.setName] = append(tSets[pkey.setName], tg)
			n[pkey.setName] += len(tg.Targets)
		}
	}
	for setName, v := range n {
		discoveredTargets.WithLabelValues(m.name, setName).Set(float64(v))
	}
	return tSets
}

// registerProviders returns the number of failed SD configs.
func (m *Manager) registerProviders(cfgs Configs, setName string) int {
	var (
		failed int
		added  bool
	)
	add := func(cfg Config) {
		for _, p := range m.providers {
			if reflect.DeepEqual(cfg, p.config) {
				p.newSubs[setName] = struct{}{}
				added = true
				return
			}
		}
		typ := cfg.Name()
		d, err := cfg.NewDiscoverer(DiscovererOptions{
			Logger: m.logger.WithField("discovery", typ),
		})
		if err != nil {
			m.logger.WithError(err).
				WithField("type", typ).
				Errorf("cannot create service discovery")
			failed++
			return
		}
		m.providers = append(m.providers, &provider{
			name:   fmt.Sprintf("%s/%d", typ, m.lastProvider),
			d:      d,
			config: cfg,
			newSubs: map[string]struct{}{
				setName: {},
			},
		})
		m.lastProvider++
		added = true
	}
	for _, cfg := range cfgs {
		add(cfg)
	}
	if !added {
		// Add an empty target group to force the refresh of the corresponding
		// scrape pool and to notify the receiver that this target set has no
		// current targets.
		// It can happen because the combined set of SD configurations is empty
		// or because we fail to instantiate all the SD configurations.
		add(StaticConfig{{}})
	}
	return failed
}

// StaticProvider holds a list of target groups that never change.
type StaticProvider struct {
	TargetGroups []*targetgroup.Group
}

// Run implements the Worker interface.
func (sd *StaticProvider) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
	// We still have to consider that the consumer exits right away in which case
	// the context will be canceled.
	select {
	case ch <- sd.TargetGroups:
	case <-ctx.Done():
	}
	close(ch)
}
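// Illustrative sketch, not part of the original file: StartCustomProvider can
// feed a fixed set of target groups through a StaticProvider, which is how a
// tool might exercise the pipeline without a real SD mechanism. It assumes
// *StaticProvider satisfies this package's Discoverer interface, as its Run
// signature suggests; the startStaticExample name and the "static" provider
// name are hypothetical.
func startStaticExample(ctx context.Context, mgr *Manager, groups []*targetgroup.Group) {
	mgr.StartCustomProvider(ctx, "static", &StaticProvider{TargetGroups: groups})
}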