github.com/yandex/pandora@v0.5.32/core/engine/engine.go

github.com/yandex/pandora@v0.5.32/core/engine/engine.go (about)

     1  package engine
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  
     8  	"github.com/pkg/errors"
     9  	"github.com/yandex/pandora/core"
    10  	"github.com/yandex/pandora/core/coreutil"
    11  	"github.com/yandex/pandora/core/warmup"
    12  	"github.com/yandex/pandora/lib/errutil"
    13  	"github.com/yandex/pandora/lib/monitoring"
    14  	"go.uber.org/zap"
    15  )
    16  
    17  type Config struct {
    18  	Pools []InstancePoolConfig `config:"pools" validate:"required,dive"`
    19  }
    20  
    21  type InstancePoolConfig struct {
    22  	ID              string
    23  	Provider        core.Provider                 `config:"ammo" validate:"required"`
    24  	Aggregator      core.Aggregator               `config:"result" validate:"required"`
    25  	NewGun          func() (core.Gun, error)      `config:"gun" validate:"required"`
    26  	RPSPerInstance  bool                          `config:"rps-per-instance"`
    27  	NewRPSSchedule  func() (core.Schedule, error) `config:"rps" validate:"required"`
    28  	StartupSchedule core.Schedule                 `config:"startup" validate:"required"`
    29  	DiscardOverflow bool                          `config:"discard_overflow"`
    30  }
    31  
    32  func NewMetrics(prefix string) Metrics {
    33  	return Metrics{
    34  		Request:        monitoring.NewCounter(prefix + "_Requests"),
    35  		Response:       monitoring.NewCounter(prefix + "_Responses"),
    36  		InstanceStart:  monitoring.NewCounter(prefix + "_UsersStarted"),
    37  		InstanceFinish: monitoring.NewCounter(prefix + "_UsersFinished"),
    38  		BusyInstances:  monitoring.NewInstanceTracker(prefix + "_BusyInstances"),
    39  	}
    40  }
    41  
    42  // TODO(skipor): use something github.com/rcrowley/go-metrics based.
    43  // Its high level primitives like Meter can be not fast enough, but EWMAs
    44  // and Counters should good for that.
    45  type Metrics struct {
    46  	Request        *monitoring.Counter
    47  	Response       *monitoring.Counter
    48  	InstanceStart  *monitoring.Counter
    49  	InstanceFinish *monitoring.Counter
    50  	BusyInstances  *monitoring.InstanceTracker
    51  }
    52  
    53  func New(log *zap.Logger, m Metrics, conf Config) *Engine {
    54  	return &Engine{log: log, config: conf, metrics: m}
    55  }
    56  
    57  type Engine struct {
    58  	log     *zap.Logger
    59  	config  Config
    60  	metrics Metrics
    61  	wait    sync.WaitGroup
    62  }
    63  
    64  // Run runs all instance pools. Run blocks until fail happen, or all pools
    65  // subroutines are successfully finished.
    66  // Ctx will be ancestor to Contexts passed to AmmoQueue, Gun and Aggregator.
    67  // That's ctx cancel cancels shooting and it's Context values can be used for communication between plugins.
    68  func (e *Engine) Run(ctx context.Context) error {
    69  	ctx, cancel := context.WithCancel(ctx)
    70  	defer func() {
    71  		e.log.Info("Engine finished")
    72  		cancel()
    73  	}()
    74  
    75  	runRes := make(chan poolRunResult, 1)
    76  	for i, conf := range e.config.Pools {
    77  		if conf.ID == "" {
    78  			conf.ID = fmt.Sprintf("pool_%v", i)
    79  		}
    80  		e.wait.Add(1)
    81  		pool := newPool(e.log, e.metrics, e.wait.Done, conf)
    82  		go func() {
    83  			err := pool.Run(ctx)
    84  			select {
    85  			case runRes <- poolRunResult{ID: pool.ID, Err: err}:
    86  			case <-ctx.Done():
    87  				pool.log.Info("Pool run result suppressed",
    88  					zap.String("id", pool.ID), zap.Error(err))
    89  			}
    90  		}()
    91  	}
    92  
    93  	for i := 0; i < len(e.config.Pools); i++ {
    94  		select {
    95  		case res := <-runRes:
    96  			e.log.Debug("Pool awaited", zap.Int("awaited", i),
    97  				zap.String("id", res.ID), zap.Error(res.Err))
    98  			if res.Err != nil {
    99  				select {
   100  				case <-ctx.Done():
   101  					return ctx.Err()
   102  				default:
   103  				}
   104  				return errors.WithMessage(res.Err, fmt.Sprintf("%q pool run failed", res.ID))
   105  			}
   106  		case <-ctx.Done():
   107  			e.log.Info("Engine run canceled")
   108  			return ctx.Err()
   109  		}
   110  	}
   111  	return nil
   112  }
   113  
   114  // Wait blocks until all run engine tasks are finished.
   115  // Useful only in case of fail, because successful run awaits all started tasks.
   116  func (e *Engine) Wait() {
   117  	e.wait.Wait()
   118  }
   119  
   120  func newPool(log *zap.Logger, m Metrics, onWaitDone func(), conf InstancePoolConfig) *instancePool {
   121  	log = log.With(zap.String("pool", conf.ID))
   122  	return &instancePool{log: log, metrics: m, onWaitDone: onWaitDone, InstancePoolConfig: conf}
   123  }
   124  
   125  type instancePool struct {
   126  	log        *zap.Logger
   127  	metrics    Metrics
   128  	onWaitDone func()
   129  	InstancePoolConfig
   130  	sharedGunDeps any
   131  }
   132  
   133  // Run start instance pool. Run blocks until fail happen, or all instances finish.
   134  // What's going on:
   135  // AmmoQueue and Aggregator are started in separate goroutines.
   136  // Instances create due to schedule is started in separate goroutine.
   137  // Every new instance started in separate goroutine.
   138  // When all instances are finished, Aggregator and AmmoQueue contexts are canceled,
   139  // and their execution results are awaited.
   140  // If error happen or Run context has been canceled, Run returns non-nil error immediately,
   141  // remaining results awaiting goroutine in background, that will call onWaitDone callback,
   142  // when all started subroutines will be finished.
   143  func (p *instancePool) Run(ctx context.Context) error {
   144  	p.log.Info("Pool run started")
   145  	ctx, cancel := context.WithCancel(ctx)
   146  	defer func() {
   147  		p.log.Info("Pool run finished")
   148  		cancel()
   149  	}()
   150  
   151  	if err := p.warmUpGun(ctx); err != nil {
   152  		p.onWaitDone()
   153  		return err
   154  	}
   155  
   156  	rh, err := p.runAsync(ctx)
   157  	if err != nil {
   158  		return err
   159  	}
   160  
   161  	awaitErr := p.awaitRunAsync(rh)
   162  
   163  	select {
   164  	case <-ctx.Done():
   165  		p.log.Info("Pool execution canceled")
   166  		return ctx.Err()
   167  	case err, ok := <-awaitErr:
   168  		if ok {
   169  			p.log.Info("Pool failed. Canceling started tasks", zap.Error(err))
   170  			return err
   171  		}
   172  		p.log.Info("Pool run finished successfully")
   173  		return nil
   174  	}
   175  }
   176  
   177  func (p *instancePool) warmUpGun(ctx context.Context) error {
   178  	gun, err := p.NewGun()
   179  	if err != nil {
   180  		return fmt.Errorf("can't initiate a gun: %w", err)
   181  	}
   182  	if gunWithWarmUp, ok := gun.(warmup.WarmedUp); ok {
   183  		p.sharedGunDeps, err = gunWithWarmUp.WarmUp(&warmup.Options{Log: p.log, Ctx: ctx})
   184  		if err != nil {
   185  			return fmt.Errorf("gun warm up failed: %w", err)
   186  		}
   187  	}
   188  	return nil
   189  }
   190  
   191  type poolAsyncRunHandle struct {
   192  	runCtx              context.Context
   193  	runCancel           context.CancelFunc
   194  	instanceStartCtx    context.Context
   195  	instanceStartCancel context.CancelFunc
   196  
   197  	providerErr   <-chan error
   198  	aggregatorErr <-chan error
   199  	startRes      <-chan startResult
   200  	// Read only actually. But can be closed by reader, to be sure, that no result has been lost.
   201  	runRes chan instanceRunResult
   202  }
   203  
   204  func (p *instancePool) runAsync(runCtx context.Context) (*poolAsyncRunHandle, error) {
   205  	// Canceled in case all instances finish, fail or run runCancel.
   206  	runCtx, runCancel := context.WithCancel(runCtx)
   207  	_ = runCancel
   208  	// Canceled also on out of ammo, and finish of shared RPS schedule.
   209  	instanceStartCtx, instanceStartCancel := context.WithCancel(runCtx)
   210  	newInstanceSchedule, err := p.buildNewInstanceSchedule(instanceStartCtx, instanceStartCancel)
   211  	if err != nil {
   212  		return nil, err
   213  	}
   214  	// Seems good enough. Even if some run will block on result send, it's not real problem.
   215  	const runResultBufSize = 64
   216  	var (
   217  		// All channels are buffered. All results should be read.
   218  		providerErr   = make(chan error, 1)
   219  		aggregatorErr = make(chan error, 1)
   220  		startRes      = make(chan startResult, 1)
   221  		runRes        = make(chan instanceRunResult, runResultBufSize)
   222  	)
   223  	go func() {
   224  		deps := core.ProviderDeps{Log: p.log, PoolID: p.ID}
   225  		providerErr <- p.Provider.Run(runCtx, deps)
   226  	}()
   227  	go func() {
   228  		deps := core.AggregatorDeps{Log: p.log}
   229  		aggregatorErr <- p.Aggregator.Run(runCtx, deps)
   230  	}()
   231  	go func() {
   232  		started, err := p.startInstances(instanceStartCtx, runCtx, newInstanceSchedule, runRes)
   233  		startRes <- startResult{started, err}
   234  	}()
   235  	return &poolAsyncRunHandle{
   236  		runCtx:              runCtx,
   237  		runCancel:           runCancel,
   238  		instanceStartCtx:    instanceStartCtx,
   239  		instanceStartCancel: instanceStartCancel,
   240  		providerErr:         providerErr,
   241  		aggregatorErr:       aggregatorErr,
   242  		runRes:              runRes,
   243  		startRes:            startRes,
   244  	}, nil
   245  }
   246  
   247  func (p *instancePool) awaitRunAsync(runHandle *poolAsyncRunHandle) <-chan error {
   248  	ah, awaitErr := p.newAwaitRunHandle(runHandle)
   249  	go func() {
   250  		defer func() {
   251  			ah.log.Debug("Pool wait finished")
   252  			close(ah.awaitErr)
   253  			if p.onWaitDone != nil {
   254  				p.onWaitDone()
   255  			}
   256  		}()
   257  		ah.awaitRun()
   258  	}()
   259  	return awaitErr
   260  }
   261  
   262  type runAwaitHandle struct {
   263  	log *zap.Logger
   264  	poolAsyncRunHandle
   265  	awaitErr         chan<- error
   266  	toWait           int
   267  	startedInstances int
   268  	awaitedInstances int
   269  }
   270  
   271  func (p *instancePool) newAwaitRunHandle(runHandle *poolAsyncRunHandle) (*runAwaitHandle, <-chan error) {
   272  	awaitErr := make(chan error)
   273  	const resultsToWait = 4 // AmmoQueue, Aggregator, instance start, instance run.
   274  	awaitHandle := &runAwaitHandle{
   275  		log:                p.log,
   276  		poolAsyncRunHandle: *runHandle,
   277  		awaitErr:           awaitErr,
   278  		toWait:             resultsToWait,
   279  		startedInstances:   -1, // Undefined until start finish.
   280  	}
   281  	return awaitHandle, awaitErr
   282  }
   283  
   284  func (ah *runAwaitHandle) awaitRun() {
   285  	for ah.toWait > 0 {
   286  		select {
   287  		case err := <-ah.providerErr:
   288  			ah.providerErr = nil
   289  			// TODO(skipor): not wait for provider, to return success result?
   290  			ah.toWait--
   291  			ah.log.Debug("AmmoQueue awaited", zap.Error(err))
   292  			if !errutil.IsCtxError(ah.runCtx, err) {
   293  				ah.onErrAwaited(errors.WithMessage(err, "provider failed"))
   294  			}
   295  		case err := <-ah.aggregatorErr:
   296  			ah.aggregatorErr = nil
   297  			ah.toWait--
   298  			ah.log.Debug("Aggregator awaited", zap.Error(err))
   299  			if !errutil.IsCtxError(ah.runCtx, err) {
   300  				ah.onErrAwaited(errors.WithMessage(err, "aggregator failed"))
   301  			}
   302  		case res := <-ah.startRes:
   303  			ah.startRes = nil
   304  			ah.toWait--
   305  			ah.startedInstances = res.Started
   306  			ah.log.Debug("Instances start awaited", zap.Int("started", ah.startedInstances), zap.Error(res.Err))
   307  			if !errutil.IsCtxError(ah.instanceStartCtx, res.Err) {
   308  				ah.onErrAwaited(errors.WithMessage(res.Err, "instances start failed"))
   309  			}
   310  			ah.checkAllInstancesAreFinished() // There is a race between run and start results.
   311  		case res := <-ah.runRes:
   312  			ah.awaitedInstances++
   313  			if ent := ah.log.Check(zap.DebugLevel, "Instance run awaited"); ent != nil {
   314  				ent.Write(zap.Int("id", res.ID), zap.Int("awaited", ah.awaitedInstances), zap.Error(res.Err))
   315  			}
   316  
   317  			if res.Err == outOfAmmoErr {
   318  				if !ah.isStartFinished() {
   319  					ah.log.Debug("Canceling instance start because out of ammo")
   320  					ah.instanceStartCancel()
   321  				}
   322  			} else if !errutil.IsCtxError(ah.runCtx, res.Err) {
   323  				ah.onErrAwaited(errors.WithMessage(res.Err, fmt.Sprintf("instance %q run failed", res.ID)))
   324  			}
   325  			ah.checkAllInstancesAreFinished()
   326  		}
   327  	}
   328  }
   329  
   330  func (ah *runAwaitHandle) onErrAwaited(err error) {
   331  	select {
   332  	case ah.awaitErr <- err:
   333  	case <-ah.runCtx.Done():
   334  		if err != ah.runCtx.Err() {
   335  			ah.log.Debug("Error suppressed after run cancel", zap.Error(err))
   336  		}
   337  	}
   338  }
   339  
   340  func (ah *runAwaitHandle) checkAllInstancesAreFinished() {
   341  	allFinished := ah.isStartFinished() && ah.awaitedInstances >= ah.startedInstances
   342  	if !allFinished {
   343  		return
   344  	}
   345  	// Assert, that all run results are awaited.
   346  	close(ah.runRes)
   347  	res, ok := <-ah.runRes
   348  	if ok {
   349  		ah.log.Panic("Unexpected run result", zap.Any("res", res))
   350  	}
   351  
   352  	ah.runRes = nil
   353  	ah.toWait--
   354  	ah.log.Info("All instances runs awaited.", zap.Int("awaited", ah.awaitedInstances))
   355  	ah.runCancel() // Signal to provider and aggregator, that pool run is finished.
   356  
   357  }
   358  
   359  func (ah *runAwaitHandle) isStartFinished() bool {
   360  	return ah.startRes == nil
   361  }
   362  
   363  func (p *instancePool) startInstances(
   364  	startCtx, runCtx context.Context,
   365  	newInstanceSchedule func() (core.Schedule, error),
   366  	runRes chan<- instanceRunResult) (started int, err error) {
   367  	deps := instanceDeps{
   368  		newSchedule: newInstanceSchedule,
   369  		newGun:      p.NewGun,
   370  		instanceSharedDeps: instanceSharedDeps{
   371  			provider:        p.Provider,
   372  			metrics:         p.metrics,
   373  			gunDeps:         p.sharedGunDeps,
   374  			aggregator:      p.Aggregator,
   375  			discardOverflow: p.DiscardOverflow,
   376  		},
   377  	}
   378  
   379  	waiter := coreutil.NewWaiter(p.StartupSchedule)
   380  
   381  	// If create all instances asynchronously, and creation will fail, too many errors appears in log.
   382  	ok := waiter.Wait(startCtx)
   383  	if !ok {
   384  		err = startCtx.Err()
   385  		return
   386  	}
   387  	firstInstance, err := newInstance(runCtx, p.log, p.ID, 0, deps)
   388  	if err != nil {
   389  		return
   390  	}
   391  	started++
   392  	go func() {
   393  		runRes <- instanceRunResult{0, func() error {
   394  			defer firstInstance.Close()
   395  			return firstInstance.Run(runCtx)
   396  		}()}
   397  	}()
   398  
   399  	for ; waiter.Wait(startCtx); started++ {
   400  		id := started
   401  		go func() {
   402  			runRes <- instanceRunResult{id, runNewInstance(runCtx, p.log, p.ID, id, deps)}
   403  		}()
   404  	}
   405  	err = startCtx.Err()
   406  	return
   407  }
   408  
   409  func (p *instancePool) buildNewInstanceSchedule(startCtx context.Context, cancelStart context.CancelFunc) (
   410  	func() (core.Schedule, error), error,
   411  ) {
   412  	if p.RPSPerInstance {
   413  		return p.NewRPSSchedule, nil
   414  	}
   415  	sharedRPSSchedule, err := p.NewRPSSchedule()
   416  	if err != nil {
   417  		return nil, err
   418  	}
   419  	sharedRPSSchedule = coreutil.NewCallbackOnFinishSchedule(sharedRPSSchedule, func() {
   420  		select {
   421  		case <-startCtx.Done():
   422  			p.log.Debug("RPS schedule has been finished")
   423  			return
   424  		default:
   425  			p.log.Info("RPS schedule has been finished. Canceling instance start.")
   426  			cancelStart()
   427  		}
   428  	})
   429  	return func() (core.Schedule, error) {
   430  		return sharedRPSSchedule, err
   431  	}, nil
   432  }
   433  
   434  func runNewInstance(ctx context.Context, log *zap.Logger, poolID string, id int, deps instanceDeps) error {
   435  	instance, err := newInstance(ctx, log, poolID, id, deps)
   436  	if err != nil {
   437  		return err
   438  	}
   439  	defer instance.Close()
   440  	return instance.Run(ctx)
   441  }
   442  
   443  type poolRunResult struct {
   444  	ID  string
   445  	Err error
   446  }
   447  
   448  type instanceRunResult struct {
   449  	ID  int
   450  	Err error
   451  }
   452  
   453  type startResult struct {
   454  	Started int
   455  	Err     error
   456  }