// github.com/yandex/pandora@v0.5.32/core/engine/engine.go

package engine

import (
    "context"
    "fmt"
    "sync"

    "github.com/pkg/errors"
    "github.com/yandex/pandora/core"
    "github.com/yandex/pandora/core/coreutil"
    "github.com/yandex/pandora/core/warmup"
    "github.com/yandex/pandora/lib/errutil"
    "github.com/yandex/pandora/lib/monitoring"
    "go.uber.org/zap"
)

type Config struct {
    Pools []InstancePoolConfig `config:"pools" validate:"required,dive"`
}

type InstancePoolConfig struct {
    ID              string
    Provider        core.Provider                 `config:"ammo" validate:"required"`
    Aggregator      core.Aggregator               `config:"result" validate:"required"`
    NewGun          func() (core.Gun, error)      `config:"gun" validate:"required"`
    RPSPerInstance  bool                          `config:"rps-per-instance"`
    NewRPSSchedule  func() (core.Schedule, error) `config:"rps" validate:"required"`
    StartupSchedule core.Schedule                 `config:"startup" validate:"required"`
    DiscardOverflow bool                          `config:"discard_overflow"`
}

func NewMetrics(prefix string) Metrics {
    return Metrics{
        Request:        monitoring.NewCounter(prefix + "_Requests"),
        Response:       monitoring.NewCounter(prefix + "_Responses"),
        InstanceStart:  monitoring.NewCounter(prefix + "_UsersStarted"),
        InstanceFinish: monitoring.NewCounter(prefix + "_UsersFinished"),
        BusyInstances:  monitoring.NewInstanceTracker(prefix + "_BusyInstances"),
    }
}

// TODO(skipor): use something based on github.com/rcrowley/go-metrics.
// Its high-level primitives like Meter may not be fast enough, but EWMAs
// and Counters should be good enough for that.
type Metrics struct {
    Request        *monitoring.Counter
    Response       *monitoring.Counter
    InstanceStart  *monitoring.Counter
    InstanceFinish *monitoring.Counter
    BusyInstances  *monitoring.InstanceTracker
}

func New(log *zap.Logger, m Metrics, conf Config) *Engine {
    return &Engine{log: log, config: conf, metrics: m}
}

type Engine struct {
    log     *zap.Logger
    config  Config
    metrics Metrics
    wait    sync.WaitGroup
}

// Run runs all instance pools. Run blocks until a failure happens, or until all
// pool subroutines finish successfully.
// ctx is the ancestor of the Contexts passed to AmmoQueue, Gun and Aggregator,
// so canceling ctx cancels shooting, and its Context values can be used for
// communication between plugins.
func (e *Engine) Run(ctx context.Context) error {
    ctx, cancel := context.WithCancel(ctx)
    defer func() {
        e.log.Info("Engine finished")
        cancel()
    }()

    runRes := make(chan poolRunResult, 1)
    for i, conf := range e.config.Pools {
        if conf.ID == "" {
            conf.ID = fmt.Sprintf("pool_%v", i)
        }
        e.wait.Add(1)
        pool := newPool(e.log, e.metrics, e.wait.Done, conf)
        go func() {
            err := pool.Run(ctx)
            select {
            case runRes <- poolRunResult{ID: pool.ID, Err: err}:
            case <-ctx.Done():
                pool.log.Info("Pool run result suppressed",
                    zap.String("id", pool.ID), zap.Error(err))
            }
        }()
    }

    for i := 0; i < len(e.config.Pools); i++ {
        select {
        case res := <-runRes:
            e.log.Debug("Pool awaited", zap.Int("awaited", i),
                zap.String("id", res.ID), zap.Error(res.Err))
            if res.Err != nil {
                select {
                case <-ctx.Done():
                    return ctx.Err()
                default:
                }
                return errors.WithMessage(res.Err, fmt.Sprintf("%q pool run failed", res.ID))
            }
        case <-ctx.Done():
            e.log.Info("Engine run canceled")
            return ctx.Err()
        }
    }
    return nil
}
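
// The sketch below illustrates typical Engine wiring: build Metrics, construct
// the Engine with New, run it with a cancellable context, and call Wait after a
// failed Run to let already started subroutines drain. The provider, aggregator,
// gun and schedule constructors shown here are hypothetical placeholders, not
// part of this package:
//
//	m := engine.NewMetrics("engine")
//	conf := engine.Config{Pools: []engine.InstancePoolConfig{{
//		ID:              "pool_0",
//		Provider:        newMyProvider(),        // hypothetical core.Provider
//		Aggregator:      newMyAggregator(),      // hypothetical core.Aggregator
//		NewGun:          newMyGun,               // hypothetical func() (core.Gun, error)
//		NewRPSSchedule:  newMyRPSSchedule,       // hypothetical func() (core.Schedule, error)
//		StartupSchedule: newMyStartupSchedule(), // hypothetical core.Schedule
//	}}}
//	eng := engine.New(zap.L(), m, conf)
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	if err := eng.Run(ctx); err != nil {
//		cancel()
//		eng.Wait() // await the subroutines started before the failure
//	}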

// Wait blocks until all run engine tasks are finished.
// Useful only in case of failure, because a successful run awaits all started tasks.
func (e *Engine) Wait() {
    e.wait.Wait()
}

func newPool(log *zap.Logger, m Metrics, onWaitDone func(), conf InstancePoolConfig) *instancePool {
    log = log.With(zap.String("pool", conf.ID))
    return &instancePool{log: log, metrics: m, onWaitDone: onWaitDone, InstancePoolConfig: conf}
}

type instancePool struct {
    log        *zap.Logger
    metrics    Metrics
    onWaitDone func()
    InstancePoolConfig
    sharedGunDeps any
}

// Run starts the instance pool. Run blocks until a failure happens, or until all instances finish.
// What's going on:
// AmmoQueue and Aggregator are started in separate goroutines.
// Instance creation driven by the startup schedule is started in a separate goroutine.
// Every new instance is started in a separate goroutine.
// When all instances are finished, the Aggregator and AmmoQueue contexts are canceled,
// and their execution results are awaited.
// If an error happens or the Run context is canceled, Run returns a non-nil error immediately,
// leaving a result-awaiting goroutine in the background, which calls the onWaitDone callback
// once all started subroutines have finished.
func (p *instancePool) Run(ctx context.Context) error {
    p.log.Info("Pool run started")
    ctx, cancel := context.WithCancel(ctx)
    defer func() {
        p.log.Info("Pool run finished")
        cancel()
    }()

    if err := p.warmUpGun(ctx); err != nil {
        p.onWaitDone()
        return err
    }

    rh, err := p.runAsync(ctx)
    if err != nil {
        return err
    }

    awaitErr := p.awaitRunAsync(rh)

    select {
    case <-ctx.Done():
        p.log.Info("Pool execution canceled")
        return ctx.Err()
    case err, ok := <-awaitErr:
        if ok {
            p.log.Info("Pool failed. Canceling started tasks", zap.Error(err))
            return err
        }
        p.log.Info("Pool run finished successfully")
        return nil
    }
}

func (p *instancePool) warmUpGun(ctx context.Context) error {
    gun, err := p.NewGun()
    if err != nil {
        return fmt.Errorf("can't initiate a gun: %w", err)
    }
    if gunWithWarmUp, ok := gun.(warmup.WarmedUp); ok {
        p.sharedGunDeps, err = gunWithWarmUp.WarmUp(&warmup.Options{Log: p.log, Ctx: ctx})
        if err != nil {
            return fmt.Errorf("gun warm up failed: %w", err)
        }
    }
    return nil
}
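
// A minimal sketch of opting into the warm-up above: a gun that also satisfies
// warmup.WarmedUp gets WarmUp called once per pool, and the returned value is
// passed to every instance as shared gun deps. myGun and sharedState are
// hypothetical, the rest of the core.Gun implementation is elided, and the exact
// WarmedUp method set is assumed from the call site above:
//
//	type myGun struct{ /* ... */ }
//
//	func (g *myGun) WarmUp(opts *warmup.Options) (any, error) {
//		opts.Log.Info("warming up")
//		// Build whatever should be shared between instances, e.g. a client or a cache.
//		return &sharedState{}, nil
//	}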

type poolAsyncRunHandle struct {
    runCtx              context.Context
    runCancel           context.CancelFunc
    instanceStartCtx    context.Context
    instanceStartCancel context.CancelFunc

    providerErr   <-chan error
    aggregatorErr <-chan error
    startRes      <-chan startResult
    // Effectively read-only, but may be closed by the reader to make sure no result has been lost.
    runRes chan instanceRunResult
}

func (p *instancePool) runAsync(runCtx context.Context) (*poolAsyncRunHandle, error) {
    // Canceled when all instances finish, on failure, or when runCancel is called.
    runCtx, runCancel := context.WithCancel(runCtx)
    _ = runCancel
    // Also canceled when out of ammo, and when the shared RPS schedule finishes.
    instanceStartCtx, instanceStartCancel := context.WithCancel(runCtx)
    newInstanceSchedule, err := p.buildNewInstanceSchedule(instanceStartCtx, instanceStartCancel)
    if err != nil {
        return nil, err
    }
    // Seems good enough. Even if some run blocks on the result send, it's not a real problem.
    const runResultBufSize = 64
    var (
        // All channels are buffered. All results should be read.
        providerErr   = make(chan error, 1)
        aggregatorErr = make(chan error, 1)
        startRes      = make(chan startResult, 1)
        runRes        = make(chan instanceRunResult, runResultBufSize)
    )
    go func() {
        deps := core.ProviderDeps{Log: p.log, PoolID: p.ID}
        providerErr <- p.Provider.Run(runCtx, deps)
    }()
    go func() {
        deps := core.AggregatorDeps{Log: p.log}
        aggregatorErr <- p.Aggregator.Run(runCtx, deps)
    }()
    go func() {
        started, err := p.startInstances(instanceStartCtx, runCtx, newInstanceSchedule, runRes)
        startRes <- startResult{started, err}
    }()
    return &poolAsyncRunHandle{
        runCtx:              runCtx,
        runCancel:           runCancel,
        instanceStartCtx:    instanceStartCtx,
        instanceStartCancel: instanceStartCancel,
        providerErr:         providerErr,
        aggregatorErr:       aggregatorErr,
        runRes:              runRes,
        startRes:            startRes,
    }, nil
}

func (p *instancePool) awaitRunAsync(runHandle *poolAsyncRunHandle) <-chan error {
    ah, awaitErr := p.newAwaitRunHandle(runHandle)
    go func() {
        defer func() {
            ah.log.Debug("Pool wait finished")
            close(ah.awaitErr)
            if p.onWaitDone != nil {
                p.onWaitDone()
            }
        }()
        ah.awaitRun()
    }()
    return awaitErr
}
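
// Awaiting works as a small accounting loop: runAwaitHandle below waits for four
// kinds of results (provider, aggregator, instance start, and the aggregate of all
// instance runs). Each select arm nils its channel once its result has arrived, so
// that arm can never fire again, and toWait reaches zero exactly when everything
// started by runAsync has reported back.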

type runAwaitHandle struct {
    log *zap.Logger
    poolAsyncRunHandle
    awaitErr         chan<- error
    toWait           int
    startedInstances int
    awaitedInstances int
}

func (p *instancePool) newAwaitRunHandle(runHandle *poolAsyncRunHandle) (*runAwaitHandle, <-chan error) {
    awaitErr := make(chan error)
    const resultsToWait = 4 // AmmoQueue, Aggregator, instance start, instance run.
    awaitHandle := &runAwaitHandle{
        log:                p.log,
        poolAsyncRunHandle: *runHandle,
        awaitErr:           awaitErr,
        toWait:             resultsToWait,
        startedInstances:   -1, // Undefined until start finishes.
    }
    return awaitHandle, awaitErr
}

func (ah *runAwaitHandle) awaitRun() {
    for ah.toWait > 0 {
        select {
        case err := <-ah.providerErr:
            ah.providerErr = nil
            // TODO(skipor): don't wait for the provider before returning a success result?
            ah.toWait--
            ah.log.Debug("AmmoQueue awaited", zap.Error(err))
            if !errutil.IsCtxError(ah.runCtx, err) {
                ah.onErrAwaited(errors.WithMessage(err, "provider failed"))
            }
        case err := <-ah.aggregatorErr:
            ah.aggregatorErr = nil
            ah.toWait--
            ah.log.Debug("Aggregator awaited", zap.Error(err))
            if !errutil.IsCtxError(ah.runCtx, err) {
                ah.onErrAwaited(errors.WithMessage(err, "aggregator failed"))
            }
        case res := <-ah.startRes:
            ah.startRes = nil
            ah.toWait--
            ah.startedInstances = res.Started
            ah.log.Debug("Instances start awaited", zap.Int("started", ah.startedInstances), zap.Error(res.Err))
            if !errutil.IsCtxError(ah.instanceStartCtx, res.Err) {
                ah.onErrAwaited(errors.WithMessage(res.Err, "instances start failed"))
            }
            ah.checkAllInstancesAreFinished() // There is a race between run and start results.
        case res := <-ah.runRes:
            ah.awaitedInstances++
            if ent := ah.log.Check(zap.DebugLevel, "Instance run awaited"); ent != nil {
                ent.Write(zap.Int("id", res.ID), zap.Int("awaited", ah.awaitedInstances), zap.Error(res.Err))
            }

            if res.Err == outOfAmmoErr {
                if !ah.isStartFinished() {
                    ah.log.Debug("Canceling instance start because out of ammo")
                    ah.instanceStartCancel()
                }
            } else if !errutil.IsCtxError(ah.runCtx, res.Err) {
                ah.onErrAwaited(errors.WithMessage(res.Err, fmt.Sprintf("instance %q run failed", res.ID)))
            }
            ah.checkAllInstancesAreFinished()
        }
    }
}

func (ah *runAwaitHandle) onErrAwaited(err error) {
    select {
    case ah.awaitErr <- err:
    case <-ah.runCtx.Done():
        if err != ah.runCtx.Err() {
            ah.log.Debug("Error suppressed after run cancel", zap.Error(err))
        }
    }
}

func (ah *runAwaitHandle) checkAllInstancesAreFinished() {
    allFinished := ah.isStartFinished() && ah.awaitedInstances >= ah.startedInstances
    if !allFinished {
        return
    }
    // Assert that all run results have been awaited.
    close(ah.runRes)
    res, ok := <-ah.runRes
    if ok {
        ah.log.Panic("Unexpected run result", zap.Any("res", res))
    }

    ah.runRes = nil
    ah.toWait--
    ah.log.Info("All instances runs awaited.", zap.Int("awaited", ah.awaitedInstances))
    ah.runCancel() // Signal to the provider and aggregator that the pool run is finished.
}

func (ah *runAwaitHandle) isStartFinished() bool {
    return ah.startRes == nil
}

func (p *instancePool) startInstances(
    startCtx, runCtx context.Context,
    newInstanceSchedule func() (core.Schedule, error),
    runRes chan<- instanceRunResult) (started int, err error) {
    deps := instanceDeps{
        newSchedule: newInstanceSchedule,
        newGun:      p.NewGun,
        instanceSharedDeps: instanceSharedDeps{
            provider:        p.Provider,
            metrics:         p.metrics,
            gunDeps:         p.sharedGunDeps,
            aggregator:      p.Aggregator,
            discardOverflow: p.DiscardOverflow,
        },
    }

    waiter := coreutil.NewWaiter(p.StartupSchedule)

    // If all instances were created asynchronously and creation failed, too many errors
    // would appear in the log, so the first instance is created synchronously.
    ok := waiter.Wait(startCtx)
    if !ok {
        err = startCtx.Err()
        return
    }
    firstInstance, err := newInstance(runCtx, p.log, p.ID, 0, deps)
    if err != nil {
        return
    }
    started++
    go func() {
        runRes <- instanceRunResult{0, func() error {
            defer firstInstance.Close()
            return firstInstance.Run(runCtx)
        }()}
    }()

    for ; waiter.Wait(startCtx); started++ {
        id := started
        go func() {
            runRes <- instanceRunResult{id, runNewInstance(runCtx, p.log, p.ID, id, deps)}
        }()
    }
    err = startCtx.Err()
    return
}
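
// buildNewInstanceSchedule below decides how RPS schedules are handed out: with
// RPSPerInstance set, every instance gets its own schedule from NewRPSSchedule;
// otherwise one shared schedule is built up front and the same object is returned
// to every instance, wrapped so that finishing the shared schedule cancels the
// start of new instances.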
Canceling instance start.") 426 cancelStart() 427 } 428 }) 429 return func() (core.Schedule, error) { 430 return sharedRPSSchedule, err 431 }, nil 432 } 433 434 func runNewInstance(ctx context.Context, log *zap.Logger, poolID string, id int, deps instanceDeps) error { 435 instance, err := newInstance(ctx, log, poolID, id, deps) 436 if err != nil { 437 return err 438 } 439 defer instance.Close() 440 return instance.Run(ctx) 441 } 442 443 type poolRunResult struct { 444 ID string 445 Err error 446 } 447 448 type instanceRunResult struct { 449 ID int 450 Err error 451 } 452 453 type startResult struct { 454 Started int 455 Err error 456 }