// Source: github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/client/distributor/placement/adaptiveplacement/placement_manager.go

package adaptiveplacement

import (
	"context"
	"errors"
	"strconv"
	"sync/atomic"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/pyroscope/pkg/iter"
	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement/adaptiveplacement/adaptive_placementpb"
	"github.com/grafana/pyroscope/pkg/util"
)

// Manager maintains placement rules and distribution stats in the store.
//
// Manager implements the services.Service interface for convenience, but
// it's meant to be started and stopped explicitly via Start and Stop calls.
//
// If the manager is stopped while it is updating rules, the ongoing attempt
// is not aborted: we're interested in finishing the operation so that the
// rules reflect the most recent statistics. Another reason is that another
// instance might already be running at the time Stop is called.
//
// When just started, the manager may not have enough statistics to build
// the rules: StatsConfidencePeriod should expire before the first update.
// Note that the ruler won't downscale datasets for a certain period of time
// after it is created, regardless of the confidence period. Therefore,
// it's generally safe to publish rules even with incomplete statistics;
// however, this allows for delays in response to changes of the data flow.
36 type Manager struct { 37 started atomic.Bool 38 startedAt time.Time 39 40 service services.Service 41 logger log.Logger 42 config Config 43 limits Limits 44 metrics *managerMetrics 45 46 store Store 47 stats *DistributionStats 48 ruler *Ruler 49 } 50 51 func NewManager( 52 logger log.Logger, 53 reg prometheus.Registerer, 54 config Config, 55 limits Limits, 56 store Store, 57 ) *Manager { 58 m := &Manager{ 59 logger: logger, 60 config: config, 61 limits: limits, 62 store: store, 63 stats: NewDistributionStats(config.StatsAggregationWindow), 64 metrics: newManagerMetrics(reg), 65 } 66 m.service = services.NewTimerService( 67 config.PlacementUpdateInterval, 68 m.starting, 69 m.updateRulesNoError, 70 m.stopping, 71 ) 72 return m 73 } 74 75 func (m *Manager) Service() services.Service { return m.service } 76 77 func (m *Manager) RecordStats(samples iter.Iterator[Sample]) { m.stats.RecordStats(samples) } 78 79 func (m *Manager) Start() { m.started.Store(true) } 80 func (m *Manager) Stop() { m.started.Store(false) } 81 82 func (m *Manager) starting(context.Context) error { return nil } 83 func (m *Manager) stopping(error) error { return nil } 84 85 // The function is only needed to satisfy the services.OneIteration 86 // signature: there's no case when the service stops on its own: 87 // it's better to serve outdated rules than to not serve at all. 88 func (m *Manager) updateRulesNoError(ctx context.Context) error { 89 util.Recover(func() { m.updateRules(ctx) }) 90 return nil 91 } 92 93 func (m *Manager) updateRules(ctx context.Context) { 94 if !m.started.Load() { 95 m.reset() 96 return 97 } 98 // Initialize the ruler if it's the first run after start. 99 if m.ruler == nil && !m.loadRules(ctx) { 100 return 101 } 102 103 // Cleanup outdated data first: note that when we load the 104 // rules from the store we don't check how old they are. 
105 now := time.Now() 106 m.ruler.Expire(now.Add(-m.config.PlacementRetentionPeriod)) 107 m.stats.Expire(now.Add(-m.config.StatsRetentionPeriod)) 108 109 stats := m.stats.Build() 110 rules := m.ruler.BuildRules(stats) 111 112 m.metrics.rulesTotal.Set(float64(len(rules.Datasets))) 113 m.metrics.statsTotal.Set(float64(len(stats.Datasets))) 114 115 if time.Since(m.startedAt) < m.config.StatsConfidencePeriod { 116 _ = level.Debug(m.logger).Log("msg", "confidence period not expired, skipping update") 117 return 118 } 119 120 if err := m.store.StoreRules(ctx, rules); err != nil { 121 _ = level.Error(m.logger).Log("msg", "failed to store placement rules", "err", err) 122 } else { 123 m.metrics.lastUpdate.SetToCurrentTime() 124 _ = level.Debug(m.logger).Log( 125 "msg", "placement rules updated", 126 "datasets", len(rules.Datasets), 127 "created_at", time.Unix(0, rules.CreatedAt), 128 ) 129 } 130 131 if err := m.store.StoreStats(ctx, stats); err != nil { 132 _ = level.Error(m.logger).Log("msg", "failed to store stats", "err", err) 133 } else { 134 _ = level.Debug(m.logger).Log( 135 "msg", "placement stats updated", 136 "datasets", len(rules.Datasets), 137 "created_at", time.Unix(0, rules.CreatedAt), 138 ) 139 } 140 141 m.exportMetrics(rules, stats) 142 } 143 144 func (m *Manager) reset() { 145 // Note that we only reset the ruler here, but not the stats: 146 // there's no harm in old samples as long as they are within 147 // the retention period. 
148 m.ruler = nil 149 m.metrics.rulesTotal.Set(0) 150 m.metrics.statsTotal.Set(0) 151 m.metrics.datasetShardLimit.Reset() 152 m.metrics.datasetShardUsage.Reset() 153 m.metrics.datasetShardUsageBreakdown.Reset() 154 } 155 156 func (m *Manager) loadRules(ctx context.Context) bool { 157 rules, err := m.store.LoadRules(ctx) 158 switch { 159 case err == nil: 160 case errors.Is(err, ErrRulesNotFound): 161 _ = level.Warn(m.logger).Log("msg", "placement rules not found") 162 rules = &adaptive_placementpb.PlacementRules{CreatedAt: time.Now().UnixNano()} 163 default: 164 _ = level.Error(m.logger).Log("msg", "failed to load placement rules", "err", err) 165 return false 166 } 167 if m.ruler == nil { 168 m.ruler = NewRuler(m.limits) 169 m.startedAt = time.Now() 170 } 171 m.ruler.Load(rules) 172 return true 173 } 174 175 func (m *Manager) exportMetrics( 176 rules *adaptive_placementpb.PlacementRules, 177 stats *adaptive_placementpb.DistributionStats, 178 ) { 179 if m.config.ExportShardLimitMetrics { 180 for _, dataset := range rules.Datasets { 181 m.metrics.datasetShardLimit.WithLabelValues( 182 rules.Tenants[dataset.Tenant].TenantId, 183 dataset.Name, 184 strconv.Itoa(int(dataset.LoadBalancing))). 185 Set(float64(dataset.DatasetShardLimit)) 186 } 187 } 188 189 if m.config.ExportShardUsageMetrics { 190 for _, dataset := range stats.Datasets { 191 m.metrics.datasetShardUsage.WithLabelValues( 192 stats.Tenants[dataset.Tenant].TenantId, 193 dataset.Name). 194 Set(float64(sum(dataset.Usage))) 195 } 196 } 197 198 if m.config.ExportShardUsageBreakdownMetrics { 199 for _, dataset := range stats.Datasets { 200 for i, ds := range dataset.Shards { 201 m.metrics.datasetShardUsageBreakdown.WithLabelValues( 202 stats.Tenants[dataset.Tenant].TenantId, 203 dataset.Name, 204 strconv.Itoa(int(stats.Shards[ds].Id)), 205 stats.Shards[ds].Owner). 206 Set(float64(dataset.Usage[i])) 207 } 208 } 209 } 210 }