github.com/letsencrypt/boulder@v0.20251208.0/ratelimits/limiter.go (about)

     1  package ratelimits
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  	"math/rand/v2"
     9  	"slices"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/jmhodges/clock"
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"github.com/prometheus/client_golang/prometheus/promauto"
    16  
    17  	berrors "github.com/letsencrypt/boulder/errors"
    18  )
    19  
const (
	// Allowed is used for rate limit metrics; it's the value of the
	// 'decision' label when a request was allowed.
	Allowed = "allowed"

	// Denied is used for rate limit metrics; it's the value of the
	// 'decision' label when a request was denied.
	Denied = "denied"
)
    29  
// allowedDecision is an "allowed" *Decision that should be returned when a
// checked limit is found to be disabled. Its remaining count is MaxInt64
// because a disabled limit never constrains the client.
var allowedDecision = &Decision{allowed: true, remaining: math.MaxInt64}
    33  
// Limiter provides a high-level interface for rate limiting requests by
// utilizing a token bucket-style approach.
type Limiter struct {
	// source is used to store buckets. It must be safe for concurrent use.
	source Source
	clk    clock.Clock

	// spendLatency observes the duration of spend operations, labeled by
	// limit name and decision (allowed|denied).
	spendLatency *prometheus.HistogramVec
}
    43  
    44  // NewLimiter returns a new *Limiter. The provided source must be safe for
    45  // concurrent use.
    46  func NewLimiter(clk clock.Clock, source Source, stats prometheus.Registerer) (*Limiter, error) {
    47  	spendLatency := promauto.With(stats).NewHistogramVec(prometheus.HistogramOpts{
    48  		Name: "ratelimits_spend_latency",
    49  		Help: fmt.Sprintf("Latency of ratelimit checks labeled by limit=[name] and decision=[%s|%s], in seconds", Allowed, Denied),
    50  		// Exponential buckets ranging from 0.0005s to 3s.
    51  		Buckets: prometheus.ExponentialBuckets(0.0005, 3, 8),
    52  	}, []string{"limit", "decision"})
    53  
    54  	return &Limiter{
    55  		source:       source,
    56  		clk:          clk,
    57  		spendLatency: spendLatency,
    58  	}, nil
    59  }
    60  
// Decision represents the result of a rate limit check or spend operation. To
// check the result of a *Decision, call the Result() method.
type Decision struct {
	// allowed is true if the bucket possessed enough capacity to allow the
	// request given the cost.
	allowed bool

	// remaining is the number of requests the client is allowed to make before
	// they're rate limited.
	remaining int64

	// retryIn is the duration the client MUST wait before they're allowed to
	// make a request.
	retryIn time.Duration

	// resetIn is the duration the bucket will take to refill to its maximum
	// capacity, assuming no further requests are made.
	resetIn time.Duration

	// newTAT indicates the time at which the bucket will be full. It is the
	// theoretical arrival time (TAT) of the next request. It must be no more
	// than (burst * (period / count)) in the future at any single point in
	// time.
	newTAT time.Time

	// transaction is the Transaction that resulted in this Decision. It is
	// included for the production of verbose Subscriber-facing errors. It is
	// set by the Limiter before returning the Decision.
	transaction Transaction
}
    90  
// Result translates a denied *Decision into a berrors.RateLimitError for the
// Subscriber, or returns nil if the *Decision allows the request. The error
// message includes a human-readable description of the exceeded rate limit and
// a retry-after timestamp.
func (d *Decision) Result(now time.Time) error {
	if d.allowed {
		return nil
	}

	// Add 0-3% jitter to the RetryIn duration to prevent thundering herd.
	jitter := time.Duration(float64(d.retryIn) * 0.03 * rand.Float64())
	retryAfter := d.retryIn + jitter
	retryAfterTs := now.UTC().Add(retryAfter).Format("2006-01-02 15:04:05 MST")

	// Each case produces a limit-specific error so Subscribers can tell
	// exactly which limit they exceeded and when to retry.
	//
	// There is no case for FailedAuthorizationsForPausingPerDomainPerAccount
	// because the RA will pause clients who exceed that ratelimit.
	switch d.transaction.limit.Name {
	case NewRegistrationsPerIPAddress:
		return berrors.RegistrationsPerIPAddressError(
			retryAfter,
			"too many new registrations (%d) from this IP address in the last %s, retry after %s",
			d.transaction.limit.Burst,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	case NewRegistrationsPerIPv6Range:
		return berrors.RegistrationsPerIPv6RangeError(
			retryAfter,
			"too many new registrations (%d) from this /48 subnet of IPv6 addresses in the last %s, retry after %s",
			d.transaction.limit.Burst,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)
	case NewOrdersPerAccount:
		return berrors.NewOrdersPerAccountError(
			retryAfter,
			"too many new orders (%d) from this account in the last %s, retry after %s",
			d.transaction.limit.Burst,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	case FailedAuthorizationsPerDomainPerAccount:
		// Uses bucket key 'enum:regId:identValue'. The identifier value is
		// everything after the final colon.
		idx := strings.LastIndex(d.transaction.bucketKey, ":")
		if idx == -1 {
			return berrors.InternalServerError("unrecognized bucket key while generating error")
		}
		identValue := d.transaction.bucketKey[idx+1:]
		return berrors.FailedAuthorizationsPerDomainPerAccountError(
			retryAfter,
			"too many failed authorizations (%d) for %q in the last %s, retry after %s",
			d.transaction.limit.Burst,
			identValue,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	case CertificatesPerDomain, CertificatesPerDomainPerAccount:
		// Uses bucket key 'enum:domainOrCIDR' or 'enum:regId:domainOrCIDR'
		// respectively; either way the subject is after the final colon.
		idx := strings.LastIndex(d.transaction.bucketKey, ":")
		if idx == -1 {
			return berrors.InternalServerError("unrecognized bucket key while generating error")
		}
		domainOrCIDR := d.transaction.bucketKey[idx+1:]
		return berrors.CertificatesPerDomainError(
			retryAfter,
			"too many certificates (%d) already issued for %q in the last %s, retry after %s",
			d.transaction.limit.Burst,
			domainOrCIDR,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	case CertificatesPerFQDNSet:
		return berrors.CertificatesPerFQDNSetError(
			retryAfter,
			"too many certificates (%d) already issued for this exact set of identifiers in the last %s, retry after %s",
			d.transaction.limit.Burst,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	case LimitOverrideRequestsPerIPAddress:
		return berrors.LimitOverrideRequestsPerIPAddressError(
			retryAfter,
			"too many override request form submissions (%d) from this IP address in the last %s, retry after %s",
			d.transaction.limit.Burst,
			d.transaction.limit.Period.Duration,
			retryAfterTs,
		)

	default:
		return berrors.InternalServerError("cannot generate error for unknown rate limit")
	}
}
   188  
   189  // Check DOES NOT deduct the cost of the request from the provided bucket's
   190  // capacity. The returned *Decision indicates whether the capacity exists to
   191  // satisfy the cost and represents the hypothetical state of the bucket IF the
   192  // cost WERE to be deducted. If no bucket exists it will NOT be created. No
   193  // state is persisted to the underlying datastore.
   194  func (l *Limiter) Check(ctx context.Context, txn Transaction) (*Decision, error) {
   195  	if txn.allowOnly() {
   196  		return allowedDecision, nil
   197  	}
   198  	// Remove cancellation from the request context so that transactions are not
   199  	// interrupted by a client disconnect.
   200  	ctx = context.WithoutCancel(ctx)
   201  	tat, err := l.source.Get(ctx, txn.bucketKey)
   202  	if err != nil {
   203  		if !errors.Is(err, ErrBucketNotFound) {
   204  			return nil, err
   205  		}
   206  		// First request from this client. No need to initialize the bucket
   207  		// because this is a check, not a spend. A TAT of "now" is equivalent to
   208  		// a full bucket.
   209  		return maybeSpend(l.clk, txn, l.clk.Now()), nil
   210  	}
   211  	return maybeSpend(l.clk, txn, tat), nil
   212  }
   213  
// Spend attempts to deduct the cost from the provided bucket's capacity. The
// returned *Decision indicates whether the capacity existed to satisfy the cost
// and represents the current state of the bucket. If no bucket exists it WILL
// be created WITH the cost factored into its initial state. The new bucket
// state is persisted to the underlying datastore, if applicable, before
// returning.
//
// Spend is a convenience wrapper around BatchSpend for a single Transaction.
func (l *Limiter) Spend(ctx context.Context, txn Transaction) (*Decision, error) {
	return l.BatchSpend(ctx, []Transaction{txn})
}
   223  
   224  func prepareBatch(txns []Transaction) ([]Transaction, []string, error) {
   225  	var bucketKeys []string
   226  	var transactions []Transaction
   227  	for _, txn := range txns {
   228  		if txn.allowOnly() {
   229  			// Ignore allow-only transactions.
   230  			continue
   231  		}
   232  		if slices.Contains(bucketKeys, txn.bucketKey) {
   233  			return nil, nil, fmt.Errorf("found duplicate bucket %q in batch", txn.bucketKey)
   234  		}
   235  		bucketKeys = append(bucketKeys, txn.bucketKey)
   236  		transactions = append(transactions, txn)
   237  	}
   238  	return transactions, bucketKeys, nil
   239  }
   240  
   241  func stricter(existing *Decision, incoming *Decision) *Decision {
   242  	if existing.retryIn == incoming.retryIn {
   243  		if existing.remaining < incoming.remaining {
   244  			return existing
   245  		}
   246  		return incoming
   247  	}
   248  	if existing.retryIn > incoming.retryIn {
   249  		return existing
   250  	}
   251  	return incoming
   252  }
   253  
// BatchSpend attempts to deduct the costs from the provided buckets'
// capacities. If applicable, new bucket states are persisted to the underlying
// datastore before returning. Non-existent buckets will be initialized WITH the
// cost factored into the initial state. The returned *Decision represents the
// strictest of all *Decisions reached in the batch.
func (l *Limiter) BatchSpend(ctx context.Context, txns []Transaction) (*Decision, error) {
	start := l.clk.Now()

	batch, bucketKeys, err := prepareBatch(txns)
	if err != nil {
		return nil, err
	}
	if len(batch) == 0 {
		// All Transactions were allow-only.
		return allowedDecision, nil
	}

	// Remove cancellation from the request context so that transactions are not
	// interrupted by a client disconnect.
	ctx = context.WithoutCancel(ctx)
	tats, err := l.source.BatchGet(ctx, bucketKeys)
	if err != nil {
		return nil, fmt.Errorf("batch get for %d keys: %w", len(bucketKeys), err)
	}
	batchDecision := allowedDecision
	// Buckets are partitioned into three persistence strategies:
	//   - newBuckets: no stored state; create via set-if-not-exists.
	//   - incrBuckets: live (future) TAT; advance via atomic increment.
	//   - staleBuckets: stored TAT is in the past; overwrite (see below).
	newBuckets := make(map[string]time.Time)
	incrBuckets := make(map[string]increment)
	staleBuckets := make(map[string]time.Time)
	txnOutcomes := make(map[Transaction]string)

	for _, txn := range batch {
		storedTAT, bucketExists := tats[txn.bucketKey]
		d := maybeSpend(l.clk, txn, storedTAT)

		// Only persist when the spend was allowed, actually moved the TAT,
		// and this Transaction is configured to spend (not check-only).
		if d.allowed && (storedTAT != d.newTAT) && txn.spend {
			if !bucketExists {
				newBuckets[txn.bucketKey] = d.newTAT
			} else if storedTAT.After(l.clk.Now()) {
				incrBuckets[txn.bucketKey] = increment{
					cost: time.Duration(txn.cost * txn.limit.emissionInterval),
					ttl:  time.Duration(txn.limit.burstOffset),
				}
			} else {
				staleBuckets[txn.bucketKey] = d.newTAT
			}
		}

		if !txn.spendOnly() {
			// Spend-only Transactions are best-effort and do not contribute to
			// the batchDecision.
			batchDecision = stricter(batchDecision, d)
		}

		txnOutcomes[txn] = Denied
		if d.allowed {
			txnOutcomes[txn] = Allowed
		}
	}

	// Persist new state only when the batch as a whole was allowed; a denied
	// batch spends nothing.
	if batchDecision.allowed {
		if len(newBuckets) > 0 {
			// Use BatchSetNotExisting to create new buckets so that we detect
			// if concurrent requests have created this bucket at the same time,
			// which would result in overwriting if we used a plain "SET"
			// command. If that happens, fall back to incrementing.
			alreadyExists, err := l.source.BatchSetNotExisting(ctx, newBuckets)
			if err != nil {
				return nil, fmt.Errorf("batch set for %d keys: %w", len(newBuckets), err)
			}
			// Find the original transaction in order to compute the increment
			// and set the TTL.
			for _, txn := range batch {
				if alreadyExists[txn.bucketKey] {
					incrBuckets[txn.bucketKey] = increment{
						cost: time.Duration(txn.cost * txn.limit.emissionInterval),
						ttl:  time.Duration(txn.limit.burstOffset),
					}
				}
			}
		}

		if len(incrBuckets) > 0 {
			err = l.source.BatchIncrement(ctx, incrBuckets)
			if err != nil {
				return nil, fmt.Errorf("batch increment for %d keys: %w", len(incrBuckets), err)
			}
		}

		if len(staleBuckets) > 0 {
			// Incrementing a TAT in the past grants unintended burst capacity.
			// So instead we overwrite it with a TAT of now + increment. This
			// approach may cause a race condition where only the last spend is
			// saved, but it's preferable to the alternative.
			err = l.source.BatchSet(ctx, staleBuckets)
			if err != nil {
				return nil, fmt.Errorf("batch set for %d keys: %w", len(staleBuckets), err)
			}
		}
	}

	// Observe latency equally across all transactions in the batch.
	totalLatency := l.clk.Since(start)
	perTxnLatency := totalLatency / time.Duration(len(txnOutcomes))
	for txn, outcome := range txnOutcomes {
		l.spendLatency.WithLabelValues(txn.limit.Name.String(), outcome).Observe(perTxnLatency.Seconds())
	}
	return batchDecision, nil
}
   362  
// Refund attempts to refund all of the cost to the capacity of the specified
// bucket. The returned *Decision indicates whether the refund was successful
// and represents the current state of the bucket. The new bucket state is
// persisted to the underlying datastore, if applicable, before returning. If no
// bucket exists it will NOT be created. Spend-only Transactions are assumed to
// be refundable. Check-only Transactions are never refunded.
//
// Note: The amount refunded cannot cause the bucket to exceed its maximum
// capacity. Partial refunds are allowed and are considered successful. For
// instance, if a bucket has a maximum capacity of 10 and currently has 5
// requests remaining, a refund request of 7 will result in the bucket reaching
// its maximum capacity of 10, not 12.
//
// Refund is a convenience wrapper around BatchRefund for a single Transaction.
func (l *Limiter) Refund(ctx context.Context, txn Transaction) (*Decision, error) {
	return l.BatchRefund(ctx, []Transaction{txn})
}
   378  
   379  // BatchRefund attempts to refund all or some of the costs to the provided
   380  // buckets' capacities. Non-existent buckets will NOT be initialized. The new
   381  // bucket state is persisted to the underlying datastore, if applicable, before
   382  // returning. Spend-only Transactions are assumed to be refundable. Check-only
   383  // Transactions are never refunded. The returned *Decision represents the
   384  // strictest of all *Decisions reached in the batch.
   385  func (l *Limiter) BatchRefund(ctx context.Context, txns []Transaction) (*Decision, error) {
   386  	batch, bucketKeys, err := prepareBatch(txns)
   387  	if err != nil {
   388  		return nil, err
   389  	}
   390  	if len(batch) == 0 {
   391  		// All Transactions were allow-only.
   392  		return allowedDecision, nil
   393  	}
   394  
   395  	// Remove cancellation from the request context so that transactions are not
   396  	// interrupted by a client disconnect.
   397  	ctx = context.WithoutCancel(ctx)
   398  	tats, err := l.source.BatchGet(ctx, bucketKeys)
   399  	if err != nil {
   400  		return nil, fmt.Errorf("batch get for %d keys: %w", len(bucketKeys), err)
   401  	}
   402  
   403  	batchDecision := allowedDecision
   404  	incrBuckets := make(map[string]increment)
   405  
   406  	for _, txn := range batch {
   407  		tat, bucketExists := tats[txn.bucketKey]
   408  		if !bucketExists {
   409  			// Ignore non-existent bucket.
   410  			continue
   411  		}
   412  
   413  		if txn.checkOnly() {
   414  			// The cost of check-only transactions are never refunded.
   415  			txn.cost = 0
   416  		}
   417  		d := maybeRefund(l.clk, txn, tat)
   418  		batchDecision = stricter(batchDecision, d)
   419  		if d.allowed && tat != d.newTAT {
   420  			// New bucket state should be persisted.
   421  			incrBuckets[txn.bucketKey] = increment{
   422  				cost: time.Duration(-txn.cost * txn.limit.emissionInterval),
   423  				ttl:  time.Duration(txn.limit.burstOffset),
   424  			}
   425  		}
   426  	}
   427  
   428  	if len(incrBuckets) > 0 {
   429  		err = l.source.BatchIncrement(ctx, incrBuckets)
   430  		if err != nil {
   431  			return nil, fmt.Errorf("batch increment for %d keys: %w", len(incrBuckets), err)
   432  		}
   433  	}
   434  	return batchDecision, nil
   435  }
   436  
   437  // BatchReset resets the specified buckets to their maximum capacity using the
   438  // provided reset Transactions. The new bucket state is persisted to the
   439  // underlying datastore before returning.
   440  func (l *Limiter) BatchReset(ctx context.Context, txns []Transaction) error {
   441  	var bucketKeys []string
   442  	for _, txn := range txns {
   443  		if txn.allowOnly() {
   444  			// Ignore allow-only transactions.
   445  			continue
   446  		}
   447  		if !txn.resetOnly() {
   448  			return fmt.Errorf("found reset-only transaction, received check=%t spend=%t reset=%t", txn.check, txn.spend, txn.reset)
   449  		}
   450  		if slices.Contains(bucketKeys, txn.bucketKey) {
   451  			return fmt.Errorf("found duplicate bucket %q in batch", txn.bucketKey)
   452  		}
   453  		bucketKeys = append(bucketKeys, txn.bucketKey)
   454  	}
   455  	if len(bucketKeys) == 0 {
   456  		return nil
   457  	}
   458  	// Remove cancellation from the request context so that transactions are not
   459  	// interrupted by a client disconnect.
   460  	ctx = context.WithoutCancel(ctx)
   461  	return l.source.BatchDelete(ctx, bucketKeys)
   462  }