github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/client/distributor/placement/adaptiveplacement/placement_manager.go (about)

     1  package adaptiveplacement
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"strconv"
     7  	"sync/atomic"
     8  	"time"
     9  
    10  	"github.com/go-kit/log"
    11  	"github.com/go-kit/log/level"
    12  	"github.com/grafana/dskit/services"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  
    15  	"github.com/grafana/pyroscope/pkg/iter"
    16  	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement/adaptiveplacement/adaptive_placementpb"
    17  	"github.com/grafana/pyroscope/pkg/util"
    18  )
    19  
// Manager maintains placement rules and distribution stats in the store.
//
// Manager implements services.Service interface for convenience, but it's
// meant to be started and stopped explicitly via Start and Stop calls.
//
// If manager is being stopped while updating rules, an ongoing attempt is
// not aborted: we're interested in finishing the operation so that the rules
// reflect the most recent statistics. Another reason is that another instance
// might be already running at the Stop call time.
//
// When just started, the manager may not have enough statistics to build
// the rules: StatsConfidencePeriod should expire before the first update.
// Note that ruler won't downscale datasets for a certain period of time
// after the ruler is created regardless of the confidence period. Therefore,
// it's generally safe to publish rules even with incomplete statistics;
// however, this allows for delays in response to changes of the data flow.
type Manager struct {
	// started gates the periodic update: set by Start, cleared by Stop,
	// and checked on every timer tick.
	started   atomic.Bool
	// startedAt records when the ruler was (re)initialized; used to
	// enforce StatsConfidencePeriod before publishing rules.
	startedAt time.Time

	service services.Service
	logger  log.Logger
	config  Config
	limits  Limits
	metrics *managerMetrics

	// store persists placement rules and distribution stats.
	store Store
	// stats accumulates per-dataset usage samples via RecordStats.
	stats *DistributionStats
	// ruler builds placement rules from stats; nil until the first
	// update cycle after Start, and reset to nil on Stop.
	ruler *Ruler
}
    50  
    51  func NewManager(
    52  	logger log.Logger,
    53  	reg prometheus.Registerer,
    54  	config Config,
    55  	limits Limits,
    56  	store Store,
    57  ) *Manager {
    58  	m := &Manager{
    59  		logger:  logger,
    60  		config:  config,
    61  		limits:  limits,
    62  		store:   store,
    63  		stats:   NewDistributionStats(config.StatsAggregationWindow),
    64  		metrics: newManagerMetrics(reg),
    65  	}
    66  	m.service = services.NewTimerService(
    67  		config.PlacementUpdateInterval,
    68  		m.starting,
    69  		m.updateRulesNoError,
    70  		m.stopping,
    71  	)
    72  	return m
    73  }
    74  
    75  func (m *Manager) Service() services.Service { return m.service }
    76  
    77  func (m *Manager) RecordStats(samples iter.Iterator[Sample]) { m.stats.RecordStats(samples) }
    78  
// Start enables rule updates on subsequent timer ticks.
func (m *Manager) Start() { m.started.Store(true) }

// Stop disables rule updates; an in-flight update is not aborted
// (see the Manager doc comment for the rationale).
func (m *Manager) Stop()  { m.started.Store(false) }
    81  
// starting and stopping are no-op lifecycle hooks required by the
// services.NewTimerService signature; actual activation is controlled
// by the started flag via Start/Stop.
func (m *Manager) starting(context.Context) error { return nil }
func (m *Manager) stopping(error) error           { return nil }
    84  
// The function is only needed to satisfy the services.OneIteration
// signature: there's no case when the service stops on its own:
// it's better to serve outdated rules than to not serve at all.
// Panics raised during the update are recovered (via util.Recover)
// so a single bad cycle cannot terminate the timer service.
func (m *Manager) updateRulesNoError(ctx context.Context) error {
	util.Recover(func() { m.updateRules(ctx) })
	return nil
}
    92  
    93  func (m *Manager) updateRules(ctx context.Context) {
    94  	if !m.started.Load() {
    95  		m.reset()
    96  		return
    97  	}
    98  	// Initialize the ruler if it's the first run after start.
    99  	if m.ruler == nil && !m.loadRules(ctx) {
   100  		return
   101  	}
   102  
   103  	// Cleanup outdated data first: note that when we load the
   104  	// rules from the store we don't check how old they are.
   105  	now := time.Now()
   106  	m.ruler.Expire(now.Add(-m.config.PlacementRetentionPeriod))
   107  	m.stats.Expire(now.Add(-m.config.StatsRetentionPeriod))
   108  
   109  	stats := m.stats.Build()
   110  	rules := m.ruler.BuildRules(stats)
   111  
   112  	m.metrics.rulesTotal.Set(float64(len(rules.Datasets)))
   113  	m.metrics.statsTotal.Set(float64(len(stats.Datasets)))
   114  
   115  	if time.Since(m.startedAt) < m.config.StatsConfidencePeriod {
   116  		_ = level.Debug(m.logger).Log("msg", "confidence period not expired, skipping update")
   117  		return
   118  	}
   119  
   120  	if err := m.store.StoreRules(ctx, rules); err != nil {
   121  		_ = level.Error(m.logger).Log("msg", "failed to store placement rules", "err", err)
   122  	} else {
   123  		m.metrics.lastUpdate.SetToCurrentTime()
   124  		_ = level.Debug(m.logger).Log(
   125  			"msg", "placement rules updated",
   126  			"datasets", len(rules.Datasets),
   127  			"created_at", time.Unix(0, rules.CreatedAt),
   128  		)
   129  	}
   130  
   131  	if err := m.store.StoreStats(ctx, stats); err != nil {
   132  		_ = level.Error(m.logger).Log("msg", "failed to store stats", "err", err)
   133  	} else {
   134  		_ = level.Debug(m.logger).Log(
   135  			"msg", "placement stats updated",
   136  			"datasets", len(rules.Datasets),
   137  			"created_at", time.Unix(0, rules.CreatedAt),
   138  		)
   139  	}
   140  
   141  	m.exportMetrics(rules, stats)
   142  }
   143  
   144  func (m *Manager) reset() {
   145  	// Note that we only reset the ruler here, but not the stats:
   146  	// there's no harm in old samples as long as they are within
   147  	// the retention period.
   148  	m.ruler = nil
   149  	m.metrics.rulesTotal.Set(0)
   150  	m.metrics.statsTotal.Set(0)
   151  	m.metrics.datasetShardLimit.Reset()
   152  	m.metrics.datasetShardUsage.Reset()
   153  	m.metrics.datasetShardUsageBreakdown.Reset()
   154  }
   155  
   156  func (m *Manager) loadRules(ctx context.Context) bool {
   157  	rules, err := m.store.LoadRules(ctx)
   158  	switch {
   159  	case err == nil:
   160  	case errors.Is(err, ErrRulesNotFound):
   161  		_ = level.Warn(m.logger).Log("msg", "placement rules not found")
   162  		rules = &adaptive_placementpb.PlacementRules{CreatedAt: time.Now().UnixNano()}
   163  	default:
   164  		_ = level.Error(m.logger).Log("msg", "failed to load placement rules", "err", err)
   165  		return false
   166  	}
   167  	if m.ruler == nil {
   168  		m.ruler = NewRuler(m.limits)
   169  		m.startedAt = time.Now()
   170  	}
   171  	m.ruler.Load(rules)
   172  	return true
   173  }
   174  
   175  func (m *Manager) exportMetrics(
   176  	rules *adaptive_placementpb.PlacementRules,
   177  	stats *adaptive_placementpb.DistributionStats,
   178  ) {
   179  	if m.config.ExportShardLimitMetrics {
   180  		for _, dataset := range rules.Datasets {
   181  			m.metrics.datasetShardLimit.WithLabelValues(
   182  				rules.Tenants[dataset.Tenant].TenantId,
   183  				dataset.Name,
   184  				strconv.Itoa(int(dataset.LoadBalancing))).
   185  				Set(float64(dataset.DatasetShardLimit))
   186  		}
   187  	}
   188  
   189  	if m.config.ExportShardUsageMetrics {
   190  		for _, dataset := range stats.Datasets {
   191  			m.metrics.datasetShardUsage.WithLabelValues(
   192  				stats.Tenants[dataset.Tenant].TenantId,
   193  				dataset.Name).
   194  				Set(float64(sum(dataset.Usage)))
   195  		}
   196  	}
   197  
   198  	if m.config.ExportShardUsageBreakdownMetrics {
   199  		for _, dataset := range stats.Datasets {
   200  			for i, ds := range dataset.Shards {
   201  				m.metrics.datasetShardUsageBreakdown.WithLabelValues(
   202  					stats.Tenants[dataset.Tenant].TenantId,
   203  					dataset.Name,
   204  					strconv.Itoa(int(stats.Shards[ds].Id)),
   205  					stats.Shards[ds].Owner).
   206  					Set(float64(dataset.Usage[i]))
   207  			}
   208  		}
   209  	}
   210  }