github.com/grafana/pyroscope@v1.18.0/pkg/ingester/limiter.go

github.com/grafana/pyroscope@v1.18.0/pkg/ingester/limiter.go (about)

     1  package ingester
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/prometheus/common/model"
     9  	"github.com/samber/lo"
    10  
    11  	phlaremodel "github.com/grafana/pyroscope/pkg/model"
    12  	"github.com/grafana/pyroscope/pkg/util"
    13  	"github.com/grafana/pyroscope/pkg/validation"
    14  )
    15  
var (
	// activeSeriesTimeout is how long a series may go without being seen
	// before cleanup removes it from the limiter's active series map.
	activeSeriesTimeout = 10 * time.Minute
	// activeSeriesCleanup is the tick interval of the limiter's background
	// loop, i.e. how often cleanup runs.
	activeSeriesCleanup = time.Minute
)
    20  
// RingCount is the view of the ring that the limiter needs in order to turn
// a global limit into a per-instance one.
type RingCount interface {
	// HealthyInstancesCount returns the number of healthy instances.
	// The limiter treats a result of 0 as "unknown" and temporarily ignores
	// the global limit in that case.
	HealthyInstancesCount() int
}
    24  
// Limits exposes the per-tenant settings the limiter consults.
type Limits interface {
	// MaxLocalSeriesPerTenant returns the series limit enforced on a single
	// instance. The limiter treats 0 as "no local limit".
	MaxLocalSeriesPerTenant(tenantID string) int
	// MaxGlobalSeriesPerTenant returns the cluster-wide series limit. The
	// limiter treats 0 as "no global limit"; any other value is converted to
	// a per-instance share before being enforced.
	MaxGlobalSeriesPerTenant(tenantID string) int
	// IngestionTenantShardSize returns the tenant's shard size. When it is
	// greater than 0 the limiter uses it to cap the instance count by which
	// the global limit is divided.
	IngestionTenantShardSize(tenantID string) int
	// DistributorUsageGroups returns the tenant's usage group configuration.
	// It is not called by the limiter code in this file.
	DistributorUsageGroups(tenantID string) *validation.UsageGroupConfig
}
    31  
// Limiter decides whether a profile may be ingested for a tenant.
type Limiter interface {
	// AllowProfile returns an error if the profile is not allowed to be ingested.
	// The error is a validation error. The interface permits implementations
	// to reject out-of-order profiles as well, but the implementation in this
	// file only enforces the max series limit.
	AllowProfile(fp model.Fingerprint, lbs phlaremodel.Labels, tsNano int64) error
	// Stop releases the resources held by the limiter. For the implementation
	// in this file that means stopping the background cleanup goroutine and
	// waiting for it to exit.
	Stop()
}
    38  
// limiter is the Limiter implementation for a single tenant. It tracks the
// tenant's active series in memory and rejects new series once the effective
// per-instance limit is reached.
type limiter struct {
	limits            Limits    // source of the tenant's local/global limits and shard size
	ring              RingCount // used to divide the global limit across healthy instances
	replicationFactor int       // multiplier applied to the per-instance share of the global limit
	tenantID          string    // tenant whose limits are looked up

	// activeSeries maps a series fingerprint to the time it was last seen,
	// as nanoseconds since the Unix epoch. Guarded by mtx.
	activeSeries map[model.Fingerprint]int64

	mtx sync.Mutex // todo: may be shard the lock to avoid latency spikes.

	// ctx, cancel and wg control the lifetime of the background cleanup
	// goroutine started by NewLimiter and stopped by Stop.
	ctx    context.Context
	cancel context.CancelFunc
	wg     sync.WaitGroup
}
    53  
    54  func NewLimiter(tenantID string, limits Limits, ring RingCount, replicationFactor int) Limiter {
    55  	ctx, cancel := context.WithCancel(context.Background())
    56  
    57  	l := &limiter{
    58  		tenantID:          tenantID,
    59  		limits:            limits,
    60  		ring:              ring,
    61  		replicationFactor: replicationFactor,
    62  		activeSeries:      map[model.Fingerprint]int64{},
    63  		cancel:            cancel,
    64  		ctx:               ctx,
    65  	}
    66  
    67  	l.wg.Add(1)
    68  	go l.loop()
    69  
    70  	return l
    71  }
    72  
// Stop cancels the limiter's context, which makes the background cleanup
// loop return, and then blocks until that goroutine has exited.
func (l *limiter) Stop() {
	l.cancel()
	l.wg.Wait()
}
    77  
    78  func (l *limiter) loop() {
    79  	defer l.wg.Done()
    80  
    81  	ticker := time.NewTicker(activeSeriesCleanup)
    82  	defer ticker.Stop()
    83  
    84  	for {
    85  		select {
    86  		case <-ticker.C:
    87  			l.cleanup()
    88  		case <-l.ctx.Done():
    89  			return
    90  		}
    91  	}
    92  }
    93  
    94  // cleanup removes the series that have not been used for a while.
    95  func (l *limiter) cleanup() {
    96  	now := time.Now().UnixNano()
    97  	l.mtx.Lock()
    98  	defer l.mtx.Unlock()
    99  
   100  	for fp, lastUsed := range l.activeSeries {
   101  		if now-lastUsed > int64(activeSeriesTimeout) {
   102  			delete(l.activeSeries, fp)
   103  		}
   104  	}
   105  }
   106  
// AllowProfile reports whether a profile for the series identified by fp may
// be ingested. It holds the limiter's lock for the duration of the check and
// delegates to allowNewSeries, so a nil result also records fp as active.
// lbs and tsNano are accepted to satisfy the Limiter interface but are not
// used by this implementation.
func (l *limiter) AllowProfile(fp model.Fingerprint, lbs phlaremodel.Labels, tsNano int64) error {
	l.mtx.Lock()
	defer l.mtx.Unlock()
	return l.allowNewSeries(fp)
}
   112  
   113  func (l *limiter) allowNewSeries(fp model.Fingerprint) error {
   114  	_, ok := l.activeSeries[fp]
   115  	series := len(l.activeSeries)
   116  	if !ok {
   117  		// can this series be added?
   118  		if err := l.assertMaxSeriesPerUser(l.tenantID, series); err != nil {
   119  			return err
   120  		}
   121  	}
   122  
   123  	// update time or add it
   124  	l.activeSeries[fp] = time.Now().UnixNano()
   125  	return nil
   126  }
   127  
   128  func (l *limiter) assertMaxSeriesPerUser(tenantID string, series int) error {
   129  	// Start by setting the local limit either from override or default
   130  	localLimit := l.limits.MaxLocalSeriesPerTenant(tenantID)
   131  
   132  	// We can assume that series are evenly distributed across ingesters
   133  	// so we do convert the global limit into a local limit
   134  	globalLimit := l.limits.MaxGlobalSeriesPerTenant(tenantID)
   135  	adjustedGlobalLimit := l.convertGlobalToLocalLimit(tenantID, globalLimit)
   136  
   137  	// Set the calculated limit to the lesser of the local limit or the new calculated global limit
   138  	calculatedLimit := minNonZero(localLimit, adjustedGlobalLimit)
   139  
   140  	// If both the local and global limits are disabled, we just
   141  	// use the largest int value
   142  	if calculatedLimit == 0 {
   143  		return nil
   144  	}
   145  
   146  	if series < calculatedLimit {
   147  		return nil
   148  	}
   149  	return validation.NewErrorf(validation.SeriesLimit, validation.SeriesLimitErrorMsg, series, calculatedLimit)
   150  }
   151  
   152  func (l *limiter) convertGlobalToLocalLimit(tenantID string, globalLimit int) int {
   153  	if globalLimit == 0 {
   154  		return 0
   155  	}
   156  
   157  	// Given we don't need a super accurate count (ie. when the ingesters
   158  	// topology changes) and we prefer to always be in favor of the tenant,
   159  	// we can use a per-ingester limit equal to:
   160  	// (global limit / number of ingesters) * replication factor
   161  	numIngesters := l.ring.HealthyInstancesCount()
   162  
   163  	// No healthy ingester may happen because the number of ingesters is asynchronously updated.
   164  	// If happens, we just temporarily ignore the global limit.
   165  	if numIngesters == 0 {
   166  		return 0
   167  	}
   168  
   169  	// If the number of available ingesters is greater than the tenant's shard
   170  	// size, then we should honor the shard size because series/metadata won't
   171  	// be written to more ingesters than it.
   172  	if shardSize := l.limits.IngestionTenantShardSize(tenantID); shardSize > 0 {
   173  		// We use Min() to protect from the case the expected shard size is > available ingesters.
   174  		numIngesters = lo.Min([]int{numIngesters, util.ShuffleShardExpectedInstances(shardSize, 1)})
   175  	}
   176  
   177  	return int((float64(globalLimit) / float64(numIngesters)) * float64(l.replicationFactor))
   178  }
   179  
   180  func minNonZero(first, second int) int {
   181  	if first == 0 || (second != 0 && first > second) {
   182  		return second
   183  	}
   184  
   185  	return first
   186  }