go.uber.org/cadence@v1.2.9/internal/internal_worker_base.go

// Copyright (c) 2017-2020 Uber Technologies Inc.
// Portions of the Software are attributed to Copyright (c) 2020 Temporal Technologies Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package internal

// All code in this file is private to the package.

import (
    "context"
    "errors"
    "fmt"
    "os"
    "runtime"
    "sync"
    "syscall"
    "time"

    "github.com/shirou/gopsutil/cpu"
    "github.com/uber-go/tally"
    "go.uber.org/zap"
    "go.uber.org/zap/zapcore"
    "golang.org/x/time/rate"

    "go.uber.org/cadence/.gen/go/shared"
    "go.uber.org/cadence/internal/common/backoff"
    "go.uber.org/cadence/internal/common/metrics"
    "go.uber.org/cadence/internal/common/util"
)

const (
    retryPollOperationInitialInterval = 20 * time.Millisecond
    retryPollOperationMaxInterval     = 10 * time.Second
    hardwareMetricsCollectInterval    = 30 * time.Second
)

var (
    pollOperationRetryPolicy = createPollRetryPolicy()
)

var errShutdown = errors.New("worker shutting down")

var collectHardwareUsageOnce sync.Once

type (
    // resultHandler is a callback that delivers a result (or error).
    resultHandler   func(result []byte, err error)
    laResultHandler func(lar *localActivityResultWrapper)

    localActivityResultWrapper struct {
        err     error // internal error type, possibly containing encoded user-error data
        result  []byte
        attempt int32
        backoff time.Duration
    }

    // workflowEnvironment represents the environment for a workflow/decider.
    // It should only be used within the scope of a workflow definition.
    workflowEnvironment interface {
        asyncActivityClient
        localActivityClient
        workflowTimerClient
        SideEffect(f func() ([]byte, error), callback resultHandler)
        GetVersion(changeID string, minSupported, maxSupported Version) Version
        WorkflowInfo() *WorkflowInfo
        Complete(result []byte, err error)
        RegisterCancelHandler(handler func())
        RequestCancelChildWorkflow(domainName, workflowID string)
        RequestCancelExternalWorkflow(domainName, workflowID, runID string, callback resultHandler)
        ExecuteChildWorkflow(params executeWorkflowParams, callback resultHandler, startedHandler func(r WorkflowExecution, e error)) error
        GetLogger() *zap.Logger
        GetMetricsScope() tally.Scope
        RegisterSignalHandler(handler func(name string, input []byte))
        SignalExternalWorkflow(domainName, workflowID, runID, signalName string, input []byte, arg interface{}, childWorkflowOnly bool, callback resultHandler)
        RegisterQueryHandler(handler func(queryType string, queryArgs []byte) ([]byte, error))
        IsReplaying() bool
        MutableSideEffect(id string, f func() interface{}, equals func(a, b interface{}) bool) Value
        GetDataConverter() DataConverter
        AddSession(sessionInfo *SessionInfo)
        RemoveSession(sessionID string)
        GetContextPropagators() []ContextPropagator
        UpsertSearchAttributes(attributes map[string]interface{}) error
        GetRegistry() *registry
        GetWorkflowInterceptors() []WorkflowInterceptorFactory
    }

    // workflowDefinition wraps the code that can execute a workflow.
    workflowDefinition interface {
        Execute(env workflowEnvironment, header *shared.Header, input []byte)
        // OnDecisionTaskStarted is called for each non-timed-out startDecision event.
        // It is executed after all history events since the previous decision have been applied to the workflowDefinition.
        OnDecisionTaskStarted()
        StackTrace() string // Stack trace of all coroutines owned by the Dispatcher instance

        // KnownQueryTypes returns the list of query types known to the workflow, along with the built-in query types.
        KnownQueryTypes() []string
        Close()
    }

    // baseWorkerOptions holds the options used to configure a base worker.
    baseWorkerOptions struct {
        pollerAutoScaler  pollerAutoScalerOptions
        pollerCount       int
        pollerRate        int
        maxConcurrentTask int
        maxTaskPerSecond  float64
        taskWorker        taskPoller
        identity          string
        workerType        string
        shutdownTimeout   time.Duration
        userContextCancel context.CancelFunc
        host              string
    }

    // baseWorker wraps the activities common to all worker types: polling, task dispatching, and shutdown.
    baseWorker struct {
        options         baseWorkerOptions
        isWorkerStarted bool
        shutdownCh      chan struct{}  // Channel used to shut down the goroutines.
        shutdownWG      sync.WaitGroup // The WaitGroup for shutting down existing routines.
        pollLimiter          *rate.Limiter
        taskLimiter          *rate.Limiter
        limiterContext       context.Context
        limiterContextCancel func()
        retrier              *backoff.ConcurrentRetrier // Retrier for backing off on service errors.
        logger               *zap.Logger
        metricsScope         tally.Scope

        pollerRequestCh    chan struct{}
        pollerAutoScaler   *pollerAutoScaler
        taskQueueCh        chan interface{}
        sessionTokenBucket *sessionTokenBucket
    }

    polledTask struct {
        task interface{}
    }
)

func createPollRetryPolicy() backoff.RetryPolicy {
    policy := backoff.NewExponentialRetryPolicy(retryPollOperationInitialInterval)
    policy.SetMaximumInterval(retryPollOperationMaxInterval)

    // NOTE: We don't use the expiration interval, since we don't use the retrier's retry mechanism;
    // we only use it to calculate the next backoff. An additional layer built on top of the poller
    // in the worker adds middleware for every poll retry, which includes
    // (a) rate limiting across pollers and (b) backing off across pollers when the server is busy.
    policy.SetExpirationInterval(backoff.NoInterval) // We don't ever expire.
    return policy
}

func newBaseWorker(options baseWorkerOptions, logger *zap.Logger, metricsScope tally.Scope, sessionTokenBucket *sessionTokenBucket) *baseWorker {
    ctx, cancel := context.WithCancel(context.Background())

    var pollerAS *pollerAutoScaler
    if pollerOptions := options.pollerAutoScaler; pollerOptions.Enabled {
        pollerAS = newPollerScaler(
            pollerOptions,
            logger,
        )
    }

    bw := &baseWorker{
        options:          options,
        shutdownCh:       make(chan struct{}),
        taskLimiter:      rate.NewLimiter(rate.Limit(options.maxTaskPerSecond), 1),
        retrier:          backoff.NewConcurrentRetrier(pollOperationRetryPolicy),
        logger:           logger.With(zapcore.Field{Key: tagWorkerType, Type: zapcore.StringType, String: options.workerType}),
        metricsScope:     tagScope(metricsScope, tagWorkerType, options.workerType),
        pollerRequestCh:  make(chan struct{}, options.maxConcurrentTask),
        pollerAutoScaler: pollerAS,
        taskQueueCh:      make(chan interface{}), // unbuffered, so a poller can only poll for a new task after the previous one has been dispatched

        limiterContext:       ctx,
        limiterContextCancel: cancel,
        sessionTokenBucket:   sessionTokenBucket,
    }
    if options.pollerRate > 0 {
        bw.pollLimiter = rate.NewLimiter(rate.Limit(options.pollerRate), 1)
    }
    return bw
}
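
// The sketch below is an illustrative, hedged example of how a baseWorker is typically
// constructed and driven; it is not part of the original file. The option values, the
// "DecisionWorker" string, and the poller/logger/metricsScope variables are assumptions
// chosen only to show the shape of the API defined above.
//
//	bw := newBaseWorker(baseWorkerOptions{
//		pollerCount:       2,
//		maxConcurrentTask: 100,
//		maxTaskPerSecond:  1000,
//		taskWorker:        poller, // some taskPoller implementation
//		workerType:        "DecisionWorker",
//		shutdownTimeout:   5 * time.Second,
//	}, logger, metricsScope, nil)
//	bw.Start()
//	// ... worker polls and processes tasks until the host decides to stop it ...
//	bw.Stop()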

// Start starts a fixed set of goroutines to do the work.
func (bw *baseWorker) Start() {
    if bw.isWorkerStarted {
        return
    }

    bw.metricsScope.Counter(metrics.WorkerStartCounter).Inc(1)

    if bw.pollerAutoScaler != nil {
        bw.pollerAutoScaler.Start()
    }

    for i := 0; i < bw.options.pollerCount; i++ {
        bw.shutdownWG.Add(1)
        go bw.runPoller()
    }

    bw.shutdownWG.Add(1)
    go bw.runTaskDispatcher()

    // We want the emit function to run once per host instead of once per worker,
    // since it emits host-level metrics.
    bw.shutdownWG.Add(1)
    go bw.emitHardwareUsage()

    bw.isWorkerStarted = true
    traceLog(func() {
        bw.logger.Info("Started Worker",
            zap.Int("PollerCount", bw.options.pollerCount),
            zap.Int("MaxConcurrentTask", bw.options.maxConcurrentTask),
            zap.Float64("MaxTaskPerSecond", bw.options.maxTaskPerSecond),
        )
    })
}

func (bw *baseWorker) isShutdown() bool {
    select {
    case <-bw.shutdownCh:
        return true
    default:
        return false
    }
}

func (bw *baseWorker) runPoller() {
    defer bw.shutdownWG.Done()
    bw.metricsScope.Counter(metrics.PollerStartCounter).Inc(1)

    for {
        select {
        case <-bw.shutdownCh:
            return
        case <-bw.pollerRequestCh:
            bw.metricsScope.Gauge(metrics.ConcurrentTaskQuota).Update(float64(cap(bw.pollerRequestCh)))
            // This metric is used to monitor how many poll requests have been allocated
            // and can be used to approximate the number of concurrently running tasks (not pinpoint accurate).
            bw.metricsScope.Gauge(metrics.PollerRequestBufferUsage).Update(float64(cap(bw.pollerRequestCh) - len(bw.pollerRequestCh)))
            if bw.sessionTokenBucket != nil {
                bw.sessionTokenBucket.waitForAvailableToken()
            }
            bw.pollTask()
        }
    }
}

func (bw *baseWorker) runTaskDispatcher() {
    defer bw.shutdownWG.Done()

    // Seed the poll-request channel with maxConcurrentTask tokens; each token allows one outstanding poll/task.
    for i := 0; i < bw.options.maxConcurrentTask; i++ {
        bw.pollerRequestCh <- struct{}{}
    }

    for {
        // wait for a new task or shutdown
        select {
        case <-bw.shutdownCh:
            return
        case task := <-bw.taskQueueCh:
            // for a non-polled task (a local activity result used as a task), we don't need to rate limit
            _, isPolledTask := task.(*polledTask)
            if isPolledTask && bw.taskLimiter.Wait(bw.limiterContext) != nil {
                if bw.isShutdown() {
                    return
                }
            }
            bw.shutdownWG.Add(1)
            go bw.processTask(task)
        }
    }
}

/*
There are three types of constraints on polling tasks:
1. the poller auto scaler constrains the number of concurrent pollers
2. the retrier applies a backoff constraint on errors
3. the limiter applies a per-second rate constraint
*/
func (bw *baseWorker) pollTask() {
    var err error
    var task interface{}

    if bw.pollerAutoScaler != nil {
        if pErr := bw.pollerAutoScaler.Acquire(1); pErr == nil {
            defer bw.pollerAutoScaler.Release(1)
        } else {
            bw.logger.Warn("poller auto scaler acquire error", zap.Error(pErr))
        }
    }

    bw.retrier.Throttle()
    if bw.pollLimiter == nil || bw.pollLimiter.Wait(bw.limiterContext) == nil {
        task, err = bw.options.taskWorker.PollTask()
        if err != nil && enableVerboseLogging {
            bw.logger.Debug("Failed to poll for task.", zap.Error(err))
        }
        if err != nil {
            if isNonRetriableError(err) {
                bw.logger.Error("Worker received non-retriable error. Shutting down.", zap.Error(err))
Shutting down.", zap.Error(err)) 319 p, _ := os.FindProcess(os.Getpid()) 320 p.Signal(syscall.SIGINT) 321 return 322 } 323 bw.retrier.Failed() 324 } else { 325 if bw.pollerAutoScaler != nil { 326 if pErr := bw.pollerAutoScaler.CollectUsage(task); pErr != nil { 327 bw.logger.Sugar().Warnw("poller auto scaler collect usage error", 328 "error", pErr, 329 "task", task) 330 } 331 } 332 bw.retrier.Succeeded() 333 } 334 } 335 336 if task != nil { 337 select { 338 case bw.taskQueueCh <- &polledTask{task}: 339 case <-bw.shutdownCh: 340 } 341 } else { 342 bw.pollerRequestCh <- struct{}{} // poll failed, trigger a new poll 343 } 344 } 345 346 func isNonRetriableError(err error) bool { 347 if err == nil { 348 return false 349 } 350 switch err.(type) { 351 case *shared.BadRequestError, 352 *shared.ClientVersionNotSupportedError: 353 return true 354 } 355 return false 356 } 357 358 func (bw *baseWorker) processTask(task interface{}) { 359 defer bw.shutdownWG.Done() 360 // If the task is from poller, after processing it we would need to request a new poll. Otherwise, the task is from 361 // local activity worker, we don't need a new poll from server. 362 polledTask, isPolledTask := task.(*polledTask) 363 if isPolledTask { 364 task = polledTask.task 365 } 366 defer func() { 367 if p := recover(); p != nil { 368 bw.metricsScope.Counter(metrics.WorkerPanicCounter).Inc(1) 369 topLine := fmt.Sprintf("base worker for %s [panic]:", bw.options.workerType) 370 st := getStackTraceRaw(topLine, 7, 0) 371 bw.logger.Error("Unhandled panic.", 372 zap.String(tagPanicError, fmt.Sprintf("%v", p)), 373 zap.String(tagPanicStack, st)) 374 } 375 376 if isPolledTask { 377 bw.pollerRequestCh <- struct{}{} 378 } 379 }() 380 err := bw.options.taskWorker.ProcessTask(task) 381 if err != nil { 382 if isClientSideError(err) { 383 bw.logger.Info("Task processing failed with client side error", zap.Error(err)) 384 } else { 385 bw.logger.Info("Task processing failed with error", zap.Error(err)) 386 } 387 } 388 } 389 390 func (bw *baseWorker) Run() { 391 bw.Start() 392 d := <-getKillSignal() 393 traceLog(func() { 394 bw.logger.Info("Worker has been killed", zap.String("Signal", d.String())) 395 }) 396 bw.Stop() 397 } 398 399 // Shutdown is a blocking call and cleans up all the resources associated with worker. 
func (bw *baseWorker) Stop() {
    if !bw.isWorkerStarted {
        return
    }
    close(bw.shutdownCh)
    bw.limiterContextCancel()
    if bw.pollerAutoScaler != nil {
        bw.pollerAutoScaler.Stop()
    }

    if success := util.AwaitWaitGroup(&bw.shutdownWG, bw.options.shutdownTimeout); !success {
        traceLog(func() {
            bw.logger.Info("Worker graceful shutdown timed out.", zap.Duration("Shutdown timeout", bw.options.shutdownTimeout))
        })
    }

    // Close context
    if bw.options.userContextCancel != nil {
        bw.options.userContextCancel()
    }
    return
}

func (bw *baseWorker) emitHardwareUsage() {
    defer func() {
        if p := recover(); p != nil {
            bw.metricsScope.Counter(metrics.WorkerPanicCounter).Inc(1)
            topLine := fmt.Sprintf("base worker for %s [panic]:", bw.options.workerType)
            st := getStackTraceRaw(topLine, 7, 0)
            bw.logger.Error("Unhandled panic in hardware emitting.",
                zap.String(tagPanicError, fmt.Sprintf("%v", p)),
                zap.String(tagPanicStack, st))
        }
    }()
    defer bw.shutdownWG.Done()
    collectHardwareUsageOnce.Do(
        func() {
            ticker := time.NewTicker(hardwareMetricsCollectInterval)
            for {
                select {
                case <-bw.shutdownCh:
                    ticker.Stop()
                    return
                case <-ticker.C:
                    host := bw.options.host
                    scope := bw.metricsScope.Tagged(map[string]string{clientHostTag: host})

                    cpuPercent, err := cpu.Percent(0, false)
                    if err != nil {
                        bw.logger.Warn("Failed to get cpu percent", zap.Error(err))
                        return
                    }
                    cpuCores, err := cpu.Counts(false)
                    if err != nil {
                        bw.logger.Warn("Failed to get number of cpu cores", zap.Error(err))
                        return
                    }
                    scope.Gauge(metrics.NumCPUCores).Update(float64(cpuCores))
                    scope.Gauge(metrics.CPUPercentage).Update(cpuPercent[0])

                    var memStats runtime.MemStats
                    runtime.ReadMemStats(&memStats)

                    scope.Gauge(metrics.NumGoRoutines).Update(float64(runtime.NumGoroutine()))
                    scope.Gauge(metrics.TotalMemory).Update(float64(memStats.Sys))
                    scope.Gauge(metrics.MemoryUsedHeap).Update(float64(memStats.HeapInuse))
                    scope.Gauge(metrics.MemoryUsedStack).Update(float64(memStats.StackInuse))
                }
            }
        })

}
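
// Illustrative note (not part of the original file): the concurrency accounting above works
// as a simple token scheme. With maxConcurrentTask = 3, for example, runTaskDispatcher seeds
// pollerRequestCh with 3 tokens; each poll attempt in runPoller consumes one token, and a
// token is returned either by pollTask when a poll yields no task, or by processTask once a
// polled task has finished. At most 3 polled tasks are therefore in flight at any time.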