github.com/grailbio/base@v0.0.11/admit/admit.go (about)

     1  // Copyright 2018 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package admit contains utilities for admission control.
     6  package admit
     7  
     8  import (
     9  	"context"
    10  	"expvar"
    11  	"sync"
    12  
    13  	"github.com/grailbio/base/log"
    14  	"github.com/grailbio/base/retry"
    15  	"github.com/grailbio/base/sync/ctxsync"
    16  )
    17  
// admitLimit and admitUsed are the expvar maps published under the names
// "admit.limit" and "admit.used". EnableVarExport stores a policy's limit and
// used counters in them, keyed by a caller-supplied name.
var (
	admitLimit = expvar.NewMap("admit.limit")
	admitUsed  = expvar.NewMap("admit.used")
)
    22  
// Policy implements the low level details of an admission control
// policy. Users typically use a utility function such as admit.Do
// or admit.Retry rather than calling a Policy directly.
type Policy interface {
	// Acquire acquires need tokens from the admission controller.
	// It returns nil on success, or a non-nil error if the context was
	// canceled before the tokens were granted.
	// Acquire can also return with an error if the number of requested tokens
	// exceeds the upper limit of available tokens.
	Acquire(ctx context.Context, need int) error

	// Release returns tokens to the admission controller; ok reports
	// whether the request was within the capacity limits.
	Release(tokens int, ok bool)
}
    37  
    38  // Do calls f after being admitted by the controller. f's bool return value is
    39  // passed on to the underlying policy upon Release, and the error is simply
    40  // returned back to the caller as a convenience.
    41  // If policy is nil, then this will simply call f.
    42  func Do(ctx context.Context, policy Policy, tokens int, f func() (bool, error)) error {
    43  	if policy == nil {
    44  		_, err := f()
    45  		return err
    46  	}
    47  	if err := policy.Acquire(ctx, tokens); err != nil {
    48  		return err
    49  	}
    50  	var (
    51  		ok  bool
    52  		err error
    53  	)
    54  	defer func() { policy.Release(tokens, ok) }()
    55  	ok, err = f()
    56  	return err
    57  }
    58  
// CapacityStatus is the feedback provided by the user to Retry about the
// underlying resource being managed by Policy.
type CapacityStatus int

const (
	// Within means that the underlying resource is within capacity.
	// It is the zero value of CapacityStatus.
	Within CapacityStatus = iota
	// OverNoRetry means that the underlying resource is over capacity but no retry is needed.
	// This is useful in situations where a request using the resource succeeded, but there are
	// signs of congestion (for example, in the form of high latency).
	OverNoRetry
	// OverNeedRetry means that the underlying resource is over capacity and a retry is needed.
	// This is useful in situations where requests failed due to the underlying resource hitting capacity limits.
	OverNeedRetry
)
    73  
// RetryPolicy combines an admission controller (Policy) with a retry policy
// (retry.Policy). Retry uses the former to admit each attempt and the latter
// to wait between attempts.
type RetryPolicy interface {
	Policy
	retry.Policy
}
    79  
    80  // Retry calls f after being admitted by the Policy (implied by the given RetryPolicy).
    81  // If f returns Within, true is passed to the underlying policy upon Release and false otherwise.
    82  // If f returns OverNeedRetry, f will be retried as per the RetryPolicy (and the error returned by f is ignored),
    83  // and if f can no longer be retried, the error returned by retry.Policy will be returned.
    84  func Retry(ctx context.Context, policy RetryPolicy, tokens int, f func() (CapacityStatus, error)) error {
    85  	var err error
    86  	for retries := 0; ; retries++ {
    87  		var c CapacityStatus
    88  		err = Do(ctx, policy, tokens, func() (bool, error) {
    89  			var err error // nolint:govet
    90  			c, err = f()
    91  			return c == Within, err
    92  		})
    93  		// Retry as per retry policy if attempt failed due to over capacity.
    94  		if c != OverNeedRetry {
    95  			break
    96  		}
    97  		if err = retry.Wait(ctx, policy, retries); err != nil {
    98  			break
    99  		}
   100  		log.Debug.Printf("admit.Retry: %v, retries=%d", err, retries)
   101  	}
   102  	return err
   103  }
   104  
// defaultLimitChangeRate is the factor passed to adjust by controller:
// Acquire uses it to compute a limit raised by this fraction, and Release
// uses its negation to lower the limit after an over-capacity request.
const defaultLimitChangeRate = 0.1
   106  
   107  // Adjust changes the limit by factor.
   108  func adjust(limit int, factor float32) int {
   109  	return int(float32(limit) * (1 + factor))
   110  }
   111  
   112  func min(x, y int) int {
   113  	if x < y {
   114  		return x
   115  	}
   116  	return y
   117  }
   118  
   119  func max(x, y int) int {
   120  	if x > y {
   121  		return x
   122  	}
   123  	return y
   124  }
   125  
// controller is the state behind the Policy returned by Controller.
// newController starts limit at low. Release lowers limit (never below low)
// after an over-capacity request and raises it (never above high) after a
// within-capacity one. Acquire and Release hold mu while running; Acquire
// waits on cond and Release broadcasts on it. usedVar is updated by both
// methods and limitVar by Release; EnableVarExport publishes them.
type controller struct {
	// limit, used are the current limit and current used tokens respectively.
	limit, used int
	// low, high define the range within which the limit can be adjusted.
	low, high         int
	mu                sync.Mutex
	cond              *ctxsync.Cond
	limitVar, usedVar expvar.Int
}
   135  
// controllerWithRetry pairs a controller with a retry.Policy by embedding
// both. ControllerWithRetry returns it (by value) as a RetryPolicy.
type controllerWithRetry struct {
	*controller
	retry.Policy
}
   140  
   141  func newController(start, limit int) *controller {
   142  	c := &controller{limit: start, used: 0, low: start, high: limit}
   143  	c.cond = ctxsync.NewCond(&c.mu)
   144  	return c
   145  }
   146  
   147  // Controller returns a Policy which starts with a concurrency limit of 'start'
   148  // and can grow upto a maximum of 'limit' as long as errors aren't observed.
   149  // A controller is not fair: tokens are not granted in FIFO order;
   150  // rather, waiters are picked randomly to be granted new tokens.
   151  func Controller(start, limit int) Policy {
   152  	return newController(start, limit)
   153  }
   154  
   155  // ControllerWithRetry returns a RetryPolicy which starts with a concurrency
   156  // limit of 'start' and can grow upto a maximum of 'limit' if no errors are seen.
   157  // A controller is not fair: tokens are not granted in FIFO order;
   158  // rather, waiters are picked randomly to be granted new tokens.
   159  func ControllerWithRetry(start, limit int, retryPolicy retry.Policy) RetryPolicy {
   160  	return controllerWithRetry{controller: newController(start, limit), Policy: retryPolicy}
   161  }
   162  
   163  // EnableVarExport enables the export of relevant vars useful for debugging/monitoring.
   164  func EnableVarExport(policy Policy, name string) {
   165  	switch c := policy.(type) {
   166  	case *controller:
   167  		admitLimit.Set(name, &c.limitVar)
   168  		admitUsed.Set(name, &c.usedVar)
   169  	case *aimd:
   170  		admitLimit.Set(name, &c.limitVar)
   171  		admitUsed.Set(name, &c.usedVar)
   172  	}
   173  }
   174  
   175  // Acquire acquires a number of tokens from the admission controller.
   176  // Returns on success, or if the context was canceled.
   177  func (c *controller) Acquire(ctx context.Context, need int) error {
   178  	c.mu.Lock()
   179  	defer c.mu.Unlock()
   180  	for {
   181  		// TODO(swami): should allow an increase only when the last release was ok
   182  		lim := min(adjust(c.limit, defaultLimitChangeRate), c.high)
   183  		have := lim - c.used
   184  		if need <= have || (need > lim && c.used == 0) {
   185  			c.used += need
   186  			c.usedVar.Set(int64(c.used))
   187  			return nil
   188  		}
   189  		if err := c.cond.Wait(ctx); err != nil {
   190  			return err
   191  		}
   192  	}
   193  }
   194  
   195  // Release releases a number of tokens to the admission controller,
   196  // reporting whether the request was within the capacity limits.
   197  func (c *controller) Release(tokens int, ok bool) {
   198  	c.mu.Lock()
   199  	defer c.mu.Unlock()
   200  	if ok {
   201  		if c.used > c.limit {
   202  			c.limit = min(c.used, c.high)
   203  		}
   204  	} else {
   205  		c.limit = max(c.low, adjust(c.limit, -defaultLimitChangeRate))
   206  	}
   207  	c.used -= tokens
   208  
   209  	c.limitVar.Set(int64(c.limit))
   210  	c.usedVar.Set(int64(c.used))
   211  	c.cond.Broadcast()
   212  }
   213  
// aimd is the state behind the Policy returned by AIMD. newAimd starts limit
// at min; Release adds one to limit after a within-capacity request made while
// the limit was fully used, and scales it down by decfactor (never below min)
// after an over-capacity request.
type aimd struct {
	// limit, used are the current limit and current used tokens respectively.
	limit, used int
	// min is the minimum limit.
	min int
	// decfactor is the factor by which tokens are reduced upon congestion.
	decfactor float32

	// Acquire and Release hold mu while running; Acquire waits on cond and
	// Release broadcasts on it. usedVar is updated by both methods and
	// limitVar by Release; EnableVarExport publishes them.
	mu                sync.Mutex
	cond              *ctxsync.Cond
	limitVar, usedVar expvar.Int
}
   226  
// aimdWithRetry pairs an aimd controller with a retry.Policy by embedding
// both. AIMDWithRetry returns it (by value) as a RetryPolicy.
type aimdWithRetry struct {
	*aimd
	retry.Policy
}
   231  
   232  func newAimd(min int, decfactor float32) *aimd {
   233  	c := &aimd{min: min, limit: min, decfactor: decfactor}
   234  	c.cond = ctxsync.NewCond(&c.mu)
   235  	return c
   236  }
   237  
   238  // AIMD returns a Policy which uses the Additive increase/multiplicative decrease
   239  // algorithm for computing the amount of the concurrency to allow.
   240  // AIMD is not fair: tokens are not granted in FIFO order;
   241  // rather, waiters are picked randomly to be granted new tokens.
   242  func AIMD(min int, decfactor float32) Policy {
   243  	return newAimd(min, decfactor)
   244  }
   245  
   246  // AIMDWithRetry returns a RetryPolicy which uses the Additive increase/multiplicative decrease
   247  // algorithm for computing the amount of the concurrency to allow.
   248  // AIMDWithRetry is not fair: tokens are not granted in FIFO order;
   249  // rather, waiters are picked randomly to be granted new tokens.
   250  func AIMDWithRetry(min int, decfactor float32, retryPolicy retry.Policy) RetryPolicy {
   251  	return aimdWithRetry{aimd: newAimd(min, decfactor), Policy: retryPolicy}
   252  }
   253  
   254  // Acquire acquires a number of tokens from the admission controller.
   255  // Returns on success, or if the context was canceled.
   256  func (c *aimd) Acquire(ctx context.Context, need int) error {
   257  	c.mu.Lock()
   258  	defer c.mu.Unlock()
   259  	for {
   260  		have := c.limit - c.used
   261  		if need <= have || (need > c.limit && c.used == 0) {
   262  			c.used += need
   263  			c.usedVar.Set(int64(c.used))
   264  			return nil
   265  		}
   266  		if err := c.cond.Wait(ctx); err != nil {
   267  			return err
   268  		}
   269  	}
   270  }
   271  
   272  // Release releases a number of tokens to the admission controller,
   273  // reporting whether the request was within the capacity limits.
   274  func (c *aimd) Release(tokens int, ok bool) {
   275  	c.mu.Lock()
   276  	defer c.mu.Unlock()
   277  	switch {
   278  	case !ok:
   279  		c.limit = max(c.min, adjust(c.limit, -c.decfactor))
   280  	case ok && c.used == c.limit:
   281  		c.limit += 1
   282  	}
   283  	c.used -= tokens
   284  
   285  	c.limitVar.Set(int64(c.limit))
   286  	c.usedVar.Set(int64(c.used))
   287  	c.cond.Broadcast()
   288  }