go.uber.org/cadence@v1.2.9/internal/internal_task_handlers.go

// Copyright (c) 2017-2020 Uber Technologies Inc.
// Portions of the Software are attributed to Copyright (c) 2020 Temporal Technologies Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package internal

// All code in this file is private to the package.

import (
	"context"
	"errors"
	"fmt"
	"math"
	"reflect"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/opentracing/opentracing-go"
	"github.com/uber-go/tally"
	"go.uber.org/zap"

	"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
	s "go.uber.org/cadence/.gen/go/shared"
	"go.uber.org/cadence/internal/common"
	"go.uber.org/cadence/internal/common/backoff"
	"go.uber.org/cadence/internal/common/cache"
	"go.uber.org/cadence/internal/common/metrics"
)

const (
	defaultHeartBeatIntervalInSec = 10 * 60

	defaultStickyCacheSize = 10000

	noRetryBackoff = time.Duration(-1)

	defaultInstantLivedWorkflowTimeoutUpperLimitInSec = 1

	defaultShortLivedWorkflowTimeoutUpperLimitInSec = 1 * 1800

	defaultMediumLivedWorkflowTimeoutUpperLimitInSec = 8 * 3600
)

type (
	// workflowExecutionEventHandler processes a single event.
	workflowExecutionEventHandler interface {
		// ProcessEvent processes a single event. Decisions it produces are accumulated
		// by the handler; an error is returned if processing fails.
		ProcessEvent(event *s.HistoryEvent, isReplay bool, isLast bool) error
		// ProcessQuery processes a query request.
		ProcessQuery(queryType string, queryArgs []byte) ([]byte, error)
		StackTrace() string
		// Close cleans up resources held by this event handler.
		Close()
	}

	// workflowTask wraps a decision task.
	workflowTask struct {
		task            *s.PollForDecisionTaskResponse
		historyIterator HistoryIterator
		doneCh          chan struct{}
		laResultCh      chan *localActivityResult
	}

	// activityTask wraps an activity task.
	activityTask struct {
		task          *s.PollForActivityTaskResponse
		pollStartTime time.Time
	}

	// resetStickinessTask wraps a ResetStickyTaskListRequest.
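	// A reset-stickiness request is queued (see queueResetStickinessTask) when a cached workflow
	// execution is evicted before it has completed or failed, so the server can stop dispatching
	// decision tasks to this worker's sticky task list.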
	resetStickinessTask struct {
		task *s.ResetStickyTaskListRequest
	}

	// workflowExecutionContextImpl is the cached workflow state for sticky execution
	workflowExecutionContextImpl struct {
		mutex             sync.Mutex
		workflowStartTime time.Time
		workflowInfo      *WorkflowInfo
		wth               *workflowTaskHandlerImpl

		// eventHandler is an atomic.Value as a temporary fix for the local activity
		// retry issue (github issue #915). The mutex should still be held when
		// accessing or modifying this field.
		eventHandler atomic.Value

		isWorkflowCompleted bool
		result              []byte
		err                 error

		previousStartedEventID int64

		newDecisions        []*s.Decision
		currentDecisionTask *s.PollForDecisionTaskResponse
		laTunnel            *localActivityTunnel
		decisionStartTime   time.Time
	}

	// workflowTaskHandlerImpl is the implementation of WorkflowTaskHandler
	workflowTaskHandlerImpl struct {
		domain                         string
		metricsScope                   *metrics.TaggedScope
		ppMgr                          pressurePointMgr
		logger                         *zap.Logger
		identity                       string
		enableLoggingInReplay          bool
		disableStickyExecution         bool
		registry                       *registry
		laTunnel                       *localActivityTunnel
		nonDeterministicWorkflowPolicy NonDeterministicWorkflowPolicy
		dataConverter                  DataConverter
		contextPropagators             []ContextPropagator
		tracer                         opentracing.Tracer
		workflowInterceptorFactories   []WorkflowInterceptorFactory
		disableStrictNonDeterminism    bool
	}

	activityProvider func(name string) activity

	// activityTaskHandlerImpl is the implementation of ActivityTaskHandler
	activityTaskHandlerImpl struct {
		taskListName       string
		identity           string
		service            workflowserviceclient.Interface
		metricsScope       *metrics.TaggedScope
		logger             *zap.Logger
		userContext        context.Context
		registry           *registry
		activityProvider   activityProvider
		dataConverter      DataConverter
		workerStopCh       <-chan struct{}
		contextPropagators []ContextPropagator
		tracer             opentracing.Tracer
		featureFlags       FeatureFlags
	}

	// history wraps a workflow task's history and provides helpers for iterating over its events.
	history struct {
		workflowTask   *workflowTask
		eventsHandler  *workflowExecutionEventHandlerImpl
		loadedEvents   []*s.HistoryEvent
		currentIndex   int
		nextEventID    int64 // next expected eventID for sanity
		lastEventID    int64 // last expected eventID, zero indicates read until end of stream
		next           []*s.HistoryEvent
		binaryChecksum *string
	}

	decisionHeartbeatError struct {
		Message string
	}
)

func newHistory(task *workflowTask, eventsHandler *workflowExecutionEventHandlerImpl) *history {
	result := &history{
		workflowTask:  task,
		eventsHandler: eventsHandler,
		loadedEvents:  task.task.History.Events,
		currentIndex:  0,
		// don't set lastEventID to task.GetNextEventId():
		// for a sticky query the history in the workflow task will be empty
		// and the query will run against the existing workflow state,
		// so the sanity check in verifyAllEventsProcessed would fail
		lastEventID: task.task.GetStartedEventId(),
	}
	if len(result.loadedEvents) > 0 {
		result.nextEventID = result.loadedEvents[0].GetEventId()
	}
	return result
}

func (e decisionHeartbeatError) Error() string {
	return e.Message
}

// GetWorkflowStartedEvent returns the workflow start event.
func (eh *history) GetWorkflowStartedEvent() (*s.HistoryEvent, error) {
	events := eh.workflowTask.task.History.Events
	if len(events) == 0 || events[0].GetEventType() != s.EventTypeWorkflowExecutionStarted {
		return nil, errors.New("unable to find WorkflowExecutionStartedEventAttributes in the history")
	}
	return events[0], nil
}

func (eh *history) IsReplayEvent(event *s.HistoryEvent) bool {
	return event.GetEventId() <= eh.workflowTask.task.GetPreviousStartedEventId() || isDecisionEvent(event.GetEventType())
}

func (eh *history) IsNextDecisionFailed() (isFailed bool, binaryChecksum *string, err error) {

	nextIndex := eh.currentIndex + 1
	if nextIndex >= len(eh.loadedEvents) && eh.hasMoreEvents() { // current page ends and there are more pages
		if err := eh.loadMoreEvents(); err != nil {
			return false, nil, err
		}
	}

	if nextIndex < len(eh.loadedEvents) {
		nextEvent := eh.loadedEvents[nextIndex]
		nextEventType := nextEvent.GetEventType()
		isFailed := nextEventType == s.EventTypeDecisionTaskTimedOut || nextEventType == s.EventTypeDecisionTaskFailed
		var binaryChecksum *string
		if nextEventType == s.EventTypeDecisionTaskCompleted {
			binaryChecksum = nextEvent.DecisionTaskCompletedEventAttributes.BinaryChecksum
		}
		return isFailed, binaryChecksum, nil
	}
	return false, nil, nil
}

func (eh *history) loadMoreEvents() error {
	historyPage, err := eh.getMoreEvents()
	if err != nil {
		return err
	}
	eh.loadedEvents = append(eh.loadedEvents, historyPage.Events...)
	if eh.nextEventID == 0 && len(eh.loadedEvents) > 0 {
		eh.nextEventID = eh.loadedEvents[0].GetEventId()
	}
	return nil
}

func isDecisionEvent(eventType s.EventType) bool {
	switch eventType {
	case s.EventTypeWorkflowExecutionCompleted,
		s.EventTypeWorkflowExecutionFailed,
		s.EventTypeWorkflowExecutionCanceled,
		s.EventTypeWorkflowExecutionContinuedAsNew,
		s.EventTypeActivityTaskScheduled,
		s.EventTypeActivityTaskCancelRequested,
		s.EventTypeTimerStarted,
		s.EventTypeTimerCanceled,
		s.EventTypeCancelTimerFailed,
		s.EventTypeMarkerRecorded,
		s.EventTypeStartChildWorkflowExecutionInitiated,
		s.EventTypeRequestCancelExternalWorkflowExecutionInitiated,
		s.EventTypeSignalExternalWorkflowExecutionInitiated,
		s.EventTypeUpsertWorkflowSearchAttributes:
		return true
	default:
		return false
	}
}

// NextDecisionEvents returns the events that are processed as new by the next decision.
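// It reads one decision ahead and caches the look-ahead batch in eh.next, so that markers produced
// by the decision that follows the returned events are available to the caller immediately.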
// TODO(maxim): Refactor to return a struct instead of multiple parameters
func (eh *history) NextDecisionEvents() (result []*s.HistoryEvent, markers []*s.HistoryEvent, binaryChecksum *string, err error) {
	if eh.next == nil {
		eh.next, _, err = eh.nextDecisionEvents()
		if err != nil {
			return result, markers, eh.binaryChecksum, err
		}
	}

	result = eh.next
	checksum := eh.binaryChecksum
	if len(result) > 0 {
		eh.next, markers, err = eh.nextDecisionEvents()
	}
	return result, markers, checksum, err
}

func (eh *history) HasNextDecisionEvents() bool {
	return len(eh.next) != 0 || eh.currentIndex != len(eh.loadedEvents) || eh.hasMoreEvents()
}

func (eh *history) hasMoreEvents() bool {
	historyIterator := eh.workflowTask.historyIterator
	return historyIterator != nil && historyIterator.HasNextPage()
}

func (eh *history) getMoreEvents() (*s.History, error) {
	return eh.workflowTask.historyIterator.GetNextPage()
}

func (eh *history) verifyAllEventsProcessed() error {
	if eh.lastEventID > 0 && eh.nextEventID <= eh.lastEventID {
		return fmt.Errorf(
			"history_events: premature end of stream, expectedLastEventID=%v but no more events after eventID=%v",
			eh.lastEventID,
			eh.nextEventID-1)
	}
	if eh.lastEventID > 0 && eh.nextEventID != (eh.lastEventID+1) {
		eh.eventsHandler.logger.Warn(
			"history_events: processed events past the expected lastEventID",
			zap.Int64("expectedLastEventID", eh.lastEventID),
			zap.Int64("processedLastEventID", eh.nextEventID-1))
	}
	return nil
}

func (eh *history) nextDecisionEvents() (nextEvents []*s.HistoryEvent, markers []*s.HistoryEvent, err error) {
	if eh.currentIndex == len(eh.loadedEvents) && !eh.hasMoreEvents() {
		if err := eh.verifyAllEventsProcessed(); err != nil {
			return nil, nil, err
		}
		return []*s.HistoryEvent{}, []*s.HistoryEvent{}, nil
	}

	// Process events

OrderEvents:
	for {
		// load more history events if needed
		for eh.currentIndex == len(eh.loadedEvents) {
			if !eh.hasMoreEvents() {
				if err = eh.verifyAllEventsProcessed(); err != nil {
					return
				}
				break OrderEvents
			}
			if err = eh.loadMoreEvents(); err != nil {
				return
			}
		}

		event := eh.loadedEvents[eh.currentIndex]
		eventID := event.GetEventId()
		if eventID != eh.nextEventID {
			err = fmt.Errorf(
				"missing history events, expectedNextEventID=%v but receivedNextEventID=%v",
				eh.nextEventID, eventID)
			return
		}

		eh.nextEventID++

		switch event.GetEventType() {
		case s.EventTypeDecisionTaskStarted:
			isFailed, binaryChecksum, err1 := eh.IsNextDecisionFailed()
			if err1 != nil {
				err = err1
				return
			}
			if !isFailed {
				eh.binaryChecksum = binaryChecksum
				eh.currentIndex++
				nextEvents = append(nextEvents, event)
				break OrderEvents
			}
		case s.EventTypeDecisionTaskScheduled,
			s.EventTypeDecisionTaskTimedOut,
			s.EventTypeDecisionTaskFailed:
			// Skip
		default:
			if isPreloadMarkerEvent(event) {
				markers = append(markers, event)
			}
			nextEvents = append(nextEvents, event)
		}
		eh.currentIndex++
	}

	// shrink loaded events so they can be GCed
	eh.loadedEvents = eh.loadedEvents[eh.currentIndex:]
	eh.currentIndex = 0

	return nextEvents, markers, nil
}

func isPreloadMarkerEvent(event *s.HistoryEvent) bool {
	return event.GetEventType() == s.EventTypeMarkerRecorded
}

// newWorkflowTaskHandler returns an implementation of workflow task handler.
func newWorkflowTaskHandler(
	domain string,
	params workerExecutionParameters,
	ppMgr pressurePointMgr,
	registry *registry,
) WorkflowTaskHandler {
	ensureRequiredParams(&params)
	wth := &workflowTaskHandlerImpl{
		domain:                         domain,
		logger:                         params.Logger,
		ppMgr:                          ppMgr,
		metricsScope:                   metrics.NewTaggedScope(params.MetricsScope),
		identity:                       params.Identity,
		enableLoggingInReplay:          params.EnableLoggingInReplay,
		disableStickyExecution:         params.DisableStickyExecution,
		registry:                       registry,
		nonDeterministicWorkflowPolicy: params.NonDeterministicWorkflowPolicy,
		dataConverter:                  params.DataConverter,
		contextPropagators:             params.ContextPropagators,
		tracer:                         params.Tracer,
		workflowInterceptorFactories:   params.WorkflowInterceptorChainFactories,
		disableStrictNonDeterminism:    params.WorkerBugPorts.DisableStrictNonDeterminismCheck,
	}

	traceLog(func() {
		wth.logger.Debug("Workflow task handler is created.",
			zap.String(tagDomain, wth.domain),
			zap.Bool("disableStrictNonDeterminism", wth.disableStrictNonDeterminism))
	})

	return wth
}

// TODO: need a better eviction policy based on memory usage
var workflowCache cache.Cache
var stickyCacheSize = defaultStickyCacheSize
var initCacheOnce sync.Once
var stickyCacheLock sync.Mutex

// SetStickyWorkflowCacheSize sets the size of the sticky workflow cache. Sticky workflow execution is the affinity
// between the decision tasks of a specific workflow execution and a specific worker. The affinity is set if sticky
// execution is enabled via Worker.Options (it is enabled by default unless disabled explicitly). The benefit of sticky
// execution is that the workflow does not have to reconstruct its state by replaying history events from the
// beginning; the cost is higher memory usage, as it relies on caching the workflow execution's running state on the
// worker. The cache is shared between workers running within the same process. This must be called before any worker
// is started. If not called, the default size of 10K (might change in the future) will be used.
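//
// A minimal usage sketch (illustrative only; 20000 is an arbitrary value):
//
//	func init() {
//		// must run before any worker is started, otherwise this panics
//		SetStickyWorkflowCacheSize(20000)
//	}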
func SetStickyWorkflowCacheSize(cacheSize int) {
	stickyCacheLock.Lock()
	defer stickyCacheLock.Unlock()
	if workflowCache != nil {
		panic("cache already created, please set cache size before worker starts.")
	}
	stickyCacheSize = cacheSize
}

func getWorkflowCache() cache.Cache {
	initCacheOnce.Do(func() {
		stickyCacheLock.Lock()
		defer stickyCacheLock.Unlock()
		workflowCache = cache.New(stickyCacheSize, &cache.Options{
			RemovedFunc: func(cachedEntity interface{}) {
				wc := cachedEntity.(*workflowExecutionContextImpl)
				wc.onEviction()
			},
		})
	})
	return workflowCache
}

func getWorkflowContext(runID string) *workflowExecutionContextImpl {
	o := getWorkflowCache().Get(runID)
	if o == nil {
		return nil
	}
	wc := o.(*workflowExecutionContextImpl)
	return wc
}

func putWorkflowContext(runID string, wc *workflowExecutionContextImpl) (*workflowExecutionContextImpl, error) {
	existing, err := getWorkflowCache().PutIfNotExist(runID, wc)
	if err != nil {
		return nil, err
	}
	return existing.(*workflowExecutionContextImpl), nil
}

func removeWorkflowContext(runID string) {
	getWorkflowCache().Delete(runID)
}

func newWorkflowExecutionContext(
	startTime time.Time,
	workflowInfo *WorkflowInfo,
	taskHandler *workflowTaskHandlerImpl,
) *workflowExecutionContextImpl {
	workflowContext := &workflowExecutionContextImpl{
		workflowStartTime: startTime,
		workflowInfo:      workflowInfo,
		wth:               taskHandler,
	}
	workflowContext.createEventHandler()
	return workflowContext
}

func (w *workflowExecutionContextImpl) Lock() {
	w.mutex.Lock()
}

func (w *workflowExecutionContextImpl) Unlock(err error) {
	cleared := false
	cached := getWorkflowCache().Exist(w.workflowInfo.WorkflowExecution.RunID)
	if err != nil || w.err != nil || w.isWorkflowCompleted || (w.wth.disableStickyExecution && !w.hasPendingLocalActivityWork()) {
		// TODO: in the closed case this assumes the close decision always succeeds. A server-side change is needed
		// to return an error indicating the close failure case; this should be rare. For now, always remove the
		// cache entry, and if the close decision failed, the next decision will have to rebuild the state.
		if cached {
			// also clears state asynchronously via cache eviction
			removeWorkflowContext(w.workflowInfo.WorkflowExecution.RunID)
		} else {
			w.clearState()
		}
		cleared = true
	}
	// There are a variety of reasons a workflow may not have been put into the cache.
	// All of them mean we need to clear the state at this point, or any running goroutines will be orphaned.
511 if !cleared && !cached { 512 w.clearState() 513 } 514 515 w.mutex.Unlock() 516 } 517 518 func (w *workflowExecutionContextImpl) getEventHandler() *workflowExecutionEventHandlerImpl { 519 eventHandler := w.eventHandler.Load() 520 if eventHandler == nil { 521 return nil 522 } 523 eventHandlerImpl, ok := eventHandler.(*workflowExecutionEventHandlerImpl) 524 if !ok { 525 panic("unknown type for workflow execution event handler") 526 } 527 return eventHandlerImpl 528 } 529 530 func (w *workflowExecutionContextImpl) completeWorkflow(result []byte, err error) { 531 w.isWorkflowCompleted = true 532 w.result = result 533 w.err = err 534 } 535 536 func (w *workflowExecutionContextImpl) shouldResetStickyOnEviction() bool { 537 // Not all evictions from the cache warrant a call to the server 538 // to reset stickiness. 539 // Cases when this is redundant or unnecessary include 540 // when an error was encountered during execution 541 // or workflow simply completed successfully. 542 return w.err == nil && !w.isWorkflowCompleted 543 } 544 545 func (w *workflowExecutionContextImpl) onEviction() { 546 // onEviction is run by LRU cache's removeFunc in separate goroutinue 547 w.mutex.Lock() 548 549 // Queue a ResetStickiness request *BEFORE* calling clearState 550 // because once destroyed, no sensible information 551 // may be ascertained about the execution context's state, 552 // nor should any of its methods be invoked. 553 if w.shouldResetStickyOnEviction() { 554 w.queueResetStickinessTask() 555 } 556 557 w.clearState() 558 w.mutex.Unlock() 559 } 560 561 func (w *workflowExecutionContextImpl) IsDestroyed() bool { 562 return w.getEventHandler() == nil 563 } 564 565 func (w *workflowExecutionContextImpl) queueResetStickinessTask() { 566 var task resetStickinessTask 567 task.task = &s.ResetStickyTaskListRequest{ 568 Domain: common.StringPtr(w.workflowInfo.Domain), 569 Execution: &s.WorkflowExecution{ 570 WorkflowId: common.StringPtr(w.workflowInfo.WorkflowExecution.ID), 571 RunId: common.StringPtr(w.workflowInfo.WorkflowExecution.RunID), 572 }, 573 } 574 // w.laTunnel could be nil for worker.ReplayHistory() because there is no worker started, in that case we don't 575 // care about resetStickinessTask. 
576 if w.laTunnel != nil && w.laTunnel.resultCh != nil { 577 w.laTunnel.resultCh <- &task 578 } 579 } 580 581 func (w *workflowExecutionContextImpl) clearState() { 582 w.clearCurrentTask() 583 w.isWorkflowCompleted = false 584 w.result = nil 585 w.err = nil 586 w.previousStartedEventID = 0 587 w.newDecisions = nil 588 589 eventHandler := w.getEventHandler() 590 if eventHandler != nil { 591 // Set isReplay to true to prevent user code in defer guarded by !isReplaying() from running 592 eventHandler.isReplay = true 593 eventHandler.Close() 594 w.eventHandler.Store((*workflowExecutionEventHandlerImpl)(nil)) 595 } 596 } 597 598 func (w *workflowExecutionContextImpl) createEventHandler() { 599 w.clearState() 600 eventHandler := newWorkflowExecutionEventHandler( 601 w.workflowInfo, 602 w.completeWorkflow, 603 w.wth.logger, 604 w.wth.enableLoggingInReplay, 605 w.wth.metricsScope, 606 w.wth.registry, 607 w.wth.dataConverter, 608 w.wth.contextPropagators, 609 w.wth.tracer, 610 w.wth.workflowInterceptorFactories, 611 ) 612 w.eventHandler.Store(eventHandler) 613 } 614 615 func resetHistory(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) (*s.History, error) { 616 historyIterator.Reset() 617 firstPageHistory, err := historyIterator.GetNextPage() 618 if err != nil { 619 return nil, err 620 } 621 task.History = firstPageHistory 622 return firstPageHistory, nil 623 } 624 625 func (wth *workflowTaskHandlerImpl) createWorkflowContext(task *s.PollForDecisionTaskResponse) (*workflowExecutionContextImpl, error) { 626 h := task.History 627 attributes := h.Events[0].WorkflowExecutionStartedEventAttributes 628 if attributes == nil { 629 return nil, errors.New("first history event is not WorkflowExecutionStarted") 630 } 631 taskList := attributes.TaskList 632 if taskList == nil { 633 return nil, errors.New("nil TaskList in WorkflowExecutionStarted event") 634 } 635 636 runID := task.WorkflowExecution.GetRunId() 637 workflowID := task.WorkflowExecution.GetWorkflowId() 638 639 // Setup workflow Info 640 var parentWorkflowExecution *WorkflowExecution 641 if attributes.ParentWorkflowExecution != nil { 642 parentWorkflowExecution = &WorkflowExecution{ 643 ID: attributes.ParentWorkflowExecution.GetWorkflowId(), 644 RunID: attributes.ParentWorkflowExecution.GetRunId(), 645 } 646 } 647 workflowInfo := &WorkflowInfo{ 648 WorkflowExecution: WorkflowExecution{ 649 ID: workflowID, 650 RunID: runID, 651 }, 652 OriginalRunId: attributes.GetOriginalExecutionRunId(), 653 WorkflowType: flowWorkflowTypeFrom(*task.WorkflowType), 654 TaskListName: taskList.GetName(), 655 ExecutionStartToCloseTimeoutSeconds: attributes.GetExecutionStartToCloseTimeoutSeconds(), 656 TaskStartToCloseTimeoutSeconds: attributes.GetTaskStartToCloseTimeoutSeconds(), 657 Domain: wth.domain, 658 Attempt: attributes.GetAttempt(), 659 lastCompletionResult: attributes.LastCompletionResult, 660 CronSchedule: attributes.CronSchedule, 661 ContinuedExecutionRunID: attributes.ContinuedExecutionRunId, 662 ParentWorkflowDomain: attributes.ParentWorkflowDomain, 663 ParentWorkflowExecution: parentWorkflowExecution, 664 Memo: attributes.Memo, 665 SearchAttributes: attributes.SearchAttributes, 666 RetryPolicy: attributes.RetryPolicy, 667 } 668 669 wfStartTime := time.Unix(0, h.Events[0].GetTimestamp()) 670 return newWorkflowExecutionContext(wfStartTime, workflowInfo, wth), nil 671 } 672 673 func (wth *workflowTaskHandlerImpl) getOrCreateWorkflowContext( 674 task *s.PollForDecisionTaskResponse, 675 historyIterator HistoryIterator, 676 ) (workflowContext 
*workflowExecutionContextImpl, err error) { 677 metricsScope := wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName()) 678 defer func(metricsScope tally.Scope) { 679 if err == nil && workflowContext != nil && workflowContext.laTunnel == nil { 680 workflowContext.laTunnel = wth.laTunnel 681 } 682 metricsScope.Gauge(metrics.StickyCacheSize).Update(float64(getWorkflowCache().Size())) 683 }(metricsScope) 684 685 runID := task.WorkflowExecution.GetRunId() 686 687 history := task.History 688 isFullHistory := isFullHistory(history) 689 690 workflowContext = nil 691 if task.Query == nil || (task.Query != nil && !isFullHistory) { 692 workflowContext = getWorkflowContext(runID) 693 } 694 695 if workflowContext != nil { 696 workflowContext.Lock() 697 // add new tag on metrics scope with workflow runtime length category 698 scope := metricsScope.Tagged(map[string]string{tagWorkflowRuntimeLength: workflowCategorizedByTimeout(workflowContext)}) 699 if task.Query != nil && !isFullHistory { 700 // query task and we have a valid cached state 701 scope.Counter(metrics.StickyCacheHit).Inc(1) 702 } else if history.Events[0].GetEventId() == workflowContext.previousStartedEventID+1 { 703 // non query task and we have a valid cached state 704 scope.Counter(metrics.StickyCacheHit).Inc(1) 705 } else { 706 // non query task and cached state is missing events, we need to discard the cached state and rebuild one. 707 workflowContext.ResetIfStale(task, historyIterator) 708 } 709 } else { 710 if !isFullHistory { 711 // we are getting partial history task, but cached state was already evicted. 712 // we need to reset history so we get events from beginning to replay/rebuild the state 713 metricsScope.Counter(metrics.StickyCacheMiss).Inc(1) 714 if history, err = resetHistory(task, historyIterator); err != nil { 715 return 716 } 717 } 718 719 if workflowContext, err = wth.createWorkflowContext(task); err != nil { 720 return 721 } 722 723 if !wth.disableStickyExecution && task.Query == nil { 724 workflowContext, _ = putWorkflowContext(runID, workflowContext) 725 } 726 workflowContext.Lock() 727 } 728 729 err = workflowContext.resetStateIfDestroyed(task, historyIterator) 730 if err != nil { 731 workflowContext.Unlock(err) 732 } 733 734 return 735 } 736 737 func isFullHistory(history *s.History) bool { 738 if len(history.Events) == 0 || history.Events[0].GetEventType() != s.EventTypeWorkflowExecutionStarted { 739 return false 740 } 741 return true 742 } 743 744 func (w *workflowExecutionContextImpl) resetStateIfDestroyed(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) error { 745 // It is possible that 2 threads (one for decision task and one for query task) that both are getting this same 746 // cached workflowContext. If one task finished with err, it would destroy the cached state. In that case, the 747 // second task needs to reset the cache state and start from beginning of the history. 748 if w.IsDestroyed() { 749 w.createEventHandler() 750 // reset history events if necessary 751 if !isFullHistory(task.History) { 752 if _, err := resetHistory(task, historyIterator); err != nil { 753 return err 754 } 755 } 756 } 757 return nil 758 } 759 760 // ProcessWorkflowTask processes all the events of the workflow task. 
761 func (wth *workflowTaskHandlerImpl) ProcessWorkflowTask( 762 workflowTask *workflowTask, 763 heartbeatFunc decisionHeartbeatFunc, 764 ) (completeRequest interface{}, errRet error) { 765 if workflowTask == nil || workflowTask.task == nil { 766 return nil, errors.New("nil workflow task provided") 767 } 768 task := workflowTask.task 769 if task.History == nil || len(task.History.Events) == 0 { 770 task.History = &s.History{ 771 Events: []*s.HistoryEvent{}, 772 } 773 } 774 if task.Query == nil && len(task.History.Events) == 0 { 775 return nil, errors.New("nil or empty history") 776 } 777 778 if task.Query != nil && len(task.Queries) != 0 { 779 return nil, errors.New("invalid query decision task") 780 } 781 782 runID := task.WorkflowExecution.GetRunId() 783 workflowID := task.WorkflowExecution.GetWorkflowId() 784 traceLog(func() { 785 wth.logger.Debug("Processing new workflow task.", 786 zap.String(tagWorkflowType, task.WorkflowType.GetName()), 787 zap.String(tagWorkflowID, workflowID), 788 zap.String(tagRunID, runID), 789 zap.Int64("PreviousStartedEventId", task.GetPreviousStartedEventId())) 790 }) 791 792 workflowContext, err := wth.getOrCreateWorkflowContext(task, workflowTask.historyIterator) 793 if err != nil { 794 return nil, err 795 } 796 797 defer func() { 798 workflowContext.Unlock(errRet) 799 }() 800 801 var response interface{} 802 process_Workflow_Loop: 803 for { 804 startTime := time.Now() 805 response, err = workflowContext.ProcessWorkflowTask(workflowTask) 806 if err == nil && response == nil { 807 wait_LocalActivity_Loop: 808 for { 809 deadlineToTrigger := time.Duration(float32(ratioToForceCompleteDecisionTaskComplete) * float32(workflowContext.GetDecisionTimeout())) 810 delayDuration := startTime.Add(deadlineToTrigger).Sub(time.Now()) 811 select { 812 case <-time.After(delayDuration): 813 // force complete, call the decision heartbeat function 814 workflowTask, err = heartbeatFunc( 815 workflowContext.CompleteDecisionTask(workflowTask, false), 816 startTime, 817 ) 818 if err != nil { 819 return nil, &decisionHeartbeatError{Message: fmt.Sprintf("error sending decision heartbeat %v", err)} 820 } 821 if workflowTask == nil { 822 return nil, nil 823 } 824 continue process_Workflow_Loop 825 826 case lar := <-workflowTask.laResultCh: 827 // local activity result ready 828 response, err = workflowContext.ProcessLocalActivityResult(workflowTask, lar) 829 if err == nil && response == nil { 830 // decision task is not done yet, still waiting for more local activities 831 continue wait_LocalActivity_Loop 832 } 833 break process_Workflow_Loop 834 } 835 } 836 } else { 837 break process_Workflow_Loop 838 } 839 } 840 return response, err 841 } 842 843 func (w *workflowExecutionContextImpl) ProcessWorkflowTask(workflowTask *workflowTask) (interface{}, error) { 844 task := workflowTask.task 845 historyIterator := workflowTask.historyIterator 846 w.workflowInfo.HistoryBytesServer = task.GetTotalHistoryBytes() 847 w.workflowInfo.HistoryCount = task.GetNextEventId() - 1 848 if err := w.ResetIfStale(task, historyIterator); err != nil { 849 return nil, err 850 } 851 w.SetCurrentTask(task) 852 853 eventHandler := w.getEventHandler() 854 reorderedHistory := newHistory(workflowTask, eventHandler) 855 var replayDecisions []*s.Decision 856 var respondEvents []*s.HistoryEvent 857 858 skipReplayCheck := w.skipReplayCheck() 859 isReplayTest := task.GetPreviousStartedEventId() == replayPreviousStartedEventID 860 if isReplayTest { 861 w.wth.logger.Info("Processing workflow task in replay test mode", 862 
zap.String(tagWorkflowType, task.WorkflowType.GetName()), 863 zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()), 864 zap.String(tagRunID, task.WorkflowExecution.GetRunId()), 865 ) 866 } 867 // Process events 868 ProcessEvents: 869 for { 870 reorderedEvents, markers, binaryChecksum, err := reorderedHistory.NextDecisionEvents() 871 w.wth.metricsScope.GetTaggedScope("workflowtype", w.workflowInfo.WorkflowType.Name).Gauge(metrics.EstimatedHistorySize).Update(float64(w.workflowInfo.TotalHistoryBytes)) 872 w.wth.metricsScope.GetTaggedScope("workflowtype", w.workflowInfo.WorkflowType.Name).Gauge(metrics.ServerSideHistorySize).Update(float64(w.workflowInfo.HistoryBytesServer)) 873 if err != nil { 874 return nil, err 875 } 876 877 if len(reorderedEvents) == 0 { 878 break ProcessEvents 879 } 880 if binaryChecksum == nil { 881 w.workflowInfo.BinaryChecksum = common.StringPtr(getBinaryChecksum()) 882 } else { 883 w.workflowInfo.BinaryChecksum = binaryChecksum 884 } 885 // Markers are from the events that are produced from the current decision 886 for _, m := range markers { 887 if m.MarkerRecordedEventAttributes.GetMarkerName() != localActivityMarkerName { 888 // local activity marker needs to be applied after decision task started event 889 err := eventHandler.ProcessEvent(m, true, false) 890 if err != nil { 891 return nil, err 892 } 893 if w.isWorkflowCompleted { 894 break ProcessEvents 895 } 896 } 897 } 898 899 for i, event := range reorderedEvents { 900 isInReplay := reorderedHistory.IsReplayEvent(event) 901 isLast := !isInReplay && i == len(reorderedEvents)-1 902 if !skipReplayCheck && isDecisionEvent(event.GetEventType()) { 903 respondEvents = append(respondEvents, event) 904 } 905 906 if isPreloadMarkerEvent(event) { 907 // marker events are processed separately 908 continue 909 } 910 911 // Any pressure points. 912 err := w.wth.executeAnyPressurePoints(event, isInReplay) 913 if err != nil { 914 return nil, err 915 } 916 917 err = eventHandler.ProcessEvent(event, isInReplay, isLast) 918 if err != nil { 919 return nil, err 920 } 921 if w.isWorkflowCompleted { 922 break ProcessEvents 923 } 924 } 925 926 // now apply local activity markers 927 for _, m := range markers { 928 if m.MarkerRecordedEventAttributes.GetMarkerName() == localActivityMarkerName { 929 err := eventHandler.ProcessEvent(m, true, false) 930 if err != nil { 931 return nil, err 932 } 933 if w.isWorkflowCompleted { 934 break ProcessEvents 935 } 936 } 937 } 938 isReplay := len(reorderedEvents) > 0 && reorderedHistory.IsReplayEvent(reorderedEvents[len(reorderedEvents)-1]) 939 lastDecisionEventsForReplayTest := isReplayTest && !reorderedHistory.HasNextDecisionEvents() 940 if isReplay && !lastDecisionEventsForReplayTest { 941 eventDecisions := eventHandler.decisionsHelper.getDecisions(true) 942 if len(eventDecisions) > 0 && !skipReplayCheck { 943 replayDecisions = append(replayDecisions, eventDecisions...) 944 } 945 } 946 } 947 948 // Non-deterministic error could happen in 2 different places: 949 // 1) the replay decisions does not match to history events. This is usually due to non backwards compatible code 950 // change to decider logic. For example, change calling one activity to a different activity. 951 // 2) the decision state machine is trying to make illegal state transition while replay a history event (like 952 // activity task completed), but the corresponding decider code that start the event has been removed. 
	// In that case, the replay of that event will panic in the decision state machine and the workflow will be
	// marked as completed with the panic error.
	var nonDeterministicErr error
	var nonDeterminismType nonDeterminismDetectionType
	if !skipReplayCheck && !w.isWorkflowCompleted || isReplayTest {
		// check if the decisions from replay match the history events
		if err := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents); err != nil {
			nonDeterministicErr = err
			nonDeterminismType = nonDeterminismDetectionTypeReplayComparison
		}
	} else if panicErr, ok := w.getWorkflowPanicIfIllegaleStatePanic(); ok {
		// This is a non-deterministic execution which ended up panicking
		nonDeterministicErr = panicErr
		nonDeterminismType = nonDeterminismDetectionTypeIllegalStatePanic
		// Since we know there is an error, we do the replay check to give more context in the log
		replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents)
		w.wth.logger.Error("Illegal state caused panic",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Error(nonDeterministicErr),
			zap.NamedError("ReplayError", replayErr),
		)
	}

	if nonDeterministicErr != nil {
		scope := w.wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName(), tagNonDeterminismDetectionType, string(nonDeterminismType))
		scope.Counter(metrics.NonDeterministicError).Inc(1)
		w.wth.logger.Error("non-deterministic-error",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Error(nonDeterministicErr))

		switch w.wth.nonDeterministicWorkflowPolicy {
		case NonDeterministicWorkflowPolicyFailWorkflow:
			// completing the workflow with a custom error will fail the workflow
			eventHandler.Complete(nil, NewCustomError("NonDeterministicWorkflowPolicyFailWorkflow", nonDeterministicErr.Error()))
		case NonDeterministicWorkflowPolicyBlockWorkflow:
			// Returning an error here will be converted to DecisionTaskFailed the first time and ignored for
			// subsequent attempts, which causes DecisionTaskTimeout; the server will retry forever until the issue
			// is fixed or the workflow times out.
			return nil, nonDeterministicErr
		default:
			panic("unknown mismatched workflow history policy.")
		}
	}

	return w.CompleteDecisionTask(workflowTask, true), nil
}

func (w *workflowExecutionContextImpl) ProcessLocalActivityResult(workflowTask *workflowTask, lar *localActivityResult) (interface{}, error) {
	if lar.err != nil && w.retryLocalActivity(lar) {
		return nil, nil // nothing to do here as we are retrying...
	}

	err := w.getEventHandler().ProcessLocalActivityResult(lar)
	if err != nil {
		return nil, err
	}

	return w.CompleteDecisionTask(workflowTask, true), nil
}

func (w *workflowExecutionContextImpl) retryLocalActivity(lar *localActivityResult) bool {
	if lar.task.retryPolicy == nil || lar.err == nil || IsCanceledError(lar.err) {
		return false
	}

	backoff := getRetryBackoff(lar, time.Now())
	if backoff > 0 && backoff <= w.GetDecisionTimeout() {
		// we need a local retry
		time.AfterFunc(backoff, func() {
			// TODO: this should not be a separate goroutine as it introduces a race condition when accessing
			// eventHandler. Currently this is solved by changing eventHandler to an atomic.Value. Ideally, this
			// retry timer should be part of the event loop for processing the workflow task.
			eventHandler := w.getEventHandler()

			// if the decision heartbeat failed, the workflow execution context will have been cleared and eventHandler will be nil
			if eventHandler == nil {
				return
			}

			if _, ok := eventHandler.pendingLaTasks[lar.task.activityID]; !ok {
				return
			}

			lar.task.attempt++

			if !w.laTunnel.sendTask(lar.task) {
				lar.task.attempt--
			}
		})
		return true
	}
	// The backoff could be large, potentially much larger than the DecisionTaskTimeout. We cannot just sleep locally
	// for the retry, because that would delay the local activity's completion and keep the decision task open; to
	// keep the decision task open we would have to keep "heartbeating" the current decision task. In that case it is
	// more efficient to create a server-side timer with the backoff duration and retry when that timer fires, so here
	// we return false to indicate that no local retry is needed. However, we have to store the current attempt and
	// backoff in the same LocalActivityResultMarker so replay can do the right thing.
	// The backoff timer will be created by workflow.ExecuteLocalActivity().
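	//
	// Illustrative arithmetic (assumed values, not from any real policy): with InitialInterval=1s,
	// BackoffCoefficient=2.0 and attempt=2, getRetryBackoff returns 1s * 2^2 = 4s. If the decision task
	// timeout is 10s, the retry is scheduled locally above; if the computed backoff were 30s instead,
	// we would fall through to here and let the workflow schedule a server-side timer.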
	lar.backoff = backoff

	return false
}

func getRetryBackoff(lar *localActivityResult, now time.Time) time.Duration {
	p := lar.task.retryPolicy
	var errReason string
	if len(p.NonRetriableErrorReasons) > 0 {
		if lar.err == ErrDeadlineExceeded {
			errReason = "timeout:" + s.TimeoutTypeScheduleToClose.String()
		} else {
			errReason, _ = getErrorDetails(lar.err, nil)
		}
	}
	return getRetryBackoffWithNowTime(p, lar.task.attempt, errReason, now, lar.task.expireTime)
}

func getRetryBackoffWithNowTime(p *RetryPolicy, attempt int32, errReason string, now, expireTime time.Time) time.Duration {
	if p.MaximumAttempts == 0 && p.ExpirationInterval == 0 {
		return noRetryBackoff
	}

	if p.MaximumAttempts > 0 && attempt > p.MaximumAttempts-1 {
		return noRetryBackoff // max attempts reached
	}

	backoffInterval := time.Duration(float64(p.InitialInterval) * math.Pow(p.BackoffCoefficient, float64(attempt)))
	if backoffInterval <= 0 {
		// math.Pow() could overflow
		if p.MaximumInterval > 0 {
			backoffInterval = p.MaximumInterval
		} else {
			return noRetryBackoff
		}
	}

	if p.MaximumInterval > 0 && backoffInterval > p.MaximumInterval {
		// cap the next interval to MaximumInterval
		backoffInterval = p.MaximumInterval
	}

	nextScheduleTime := now.Add(backoffInterval)
	if !expireTime.IsZero() && nextScheduleTime.After(expireTime) {
		return noRetryBackoff
	}

	// check if the error is non-retriable
	for _, er := range p.NonRetriableErrorReasons {
		if er == errReason {
			return noRetryBackoff
		}
	}

	return backoffInterval
}

func (w *workflowExecutionContextImpl) CompleteDecisionTask(workflowTask *workflowTask, waitLocalActivities bool) interface{} {
	if w.currentDecisionTask == nil {
		return nil
	}
	eventHandler := w.getEventHandler()

	// w.laTunnel could be nil for worker.ReplayHistory() because no worker is started; in that case we don't care
	// about the pending local activities and just return, because the result is ignored by the caller anyway.
	if w.hasPendingLocalActivityWork() && w.laTunnel != nil {
		if len(eventHandler.unstartedLaTasks) > 0 {
			// start new local activity tasks
			unstartedLaTasks := make(map[string]struct{})
			for activityID := range eventHandler.unstartedLaTasks {
				task := eventHandler.pendingLaTasks[activityID]
				task.wc = w
				task.workflowTask = workflowTask
				if !w.laTunnel.sendTask(task) {
					unstartedLaTasks[activityID] = struct{}{}
					task.wc = nil
					task.workflowTask = nil
				}
			}
			eventHandler.unstartedLaTasks = unstartedLaTasks
		}
		// cannot complete the decision task as there are pending local activities
		if waitLocalActivities {
			return nil
		}
	}

	eventDecisions := eventHandler.decisionsHelper.getDecisions(true)
	if len(eventDecisions) > 0 {
		w.newDecisions = append(w.newDecisions, eventDecisions...)
	}

	completeRequest := w.wth.completeWorkflow(eventHandler, w.currentDecisionTask, w, w.newDecisions, !waitLocalActivities)
	w.clearCurrentTask()

	return completeRequest
}

func (w *workflowExecutionContextImpl) hasPendingLocalActivityWork() bool {
	eventHandler := w.getEventHandler()
	return !w.isWorkflowCompleted &&
		w.currentDecisionTask != nil &&
		w.currentDecisionTask.Query == nil && // don't run local activities for query tasks
		eventHandler != nil &&
		len(eventHandler.pendingLaTasks) > 0
}

func (w *workflowExecutionContextImpl) clearCurrentTask() {
	w.newDecisions = nil
	w.currentDecisionTask = nil
}

func (w *workflowExecutionContextImpl) skipReplayCheck() bool {
	return w.currentDecisionTask.Query != nil || !isFullHistory(w.currentDecisionTask.History)
}

func (w *workflowExecutionContextImpl) SetCurrentTask(task *s.PollForDecisionTaskResponse) {
	w.currentDecisionTask = task
	// do not update the previousStartedEventID for query tasks
	if task.Query == nil {
		w.previousStartedEventID = task.GetStartedEventId()
	}
	w.decisionStartTime = time.Now()
}

func (w *workflowExecutionContextImpl) ResetIfStale(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) error {
	if len(task.History.Events) > 0 && task.History.Events[0].GetEventId() != w.previousStartedEventID+1 {
		w.wth.logger.Debug("Cached state is stale, new task has unexpected events",
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Int64("CachedPreviousStartedEventID", w.previousStartedEventID),
			zap.Int64("TaskFirstEventID", task.History.Events[0].GetEventId()),
			zap.Int64("TaskStartedEventID", task.GetStartedEventId()),
			zap.Int64("TaskPreviousStartedEventID", task.GetPreviousStartedEventId()))

		w.wth.metricsScope.
			GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName()).
			Counter(metrics.StickyCacheStall).Inc(1)

		w.clearState()
		return w.resetStateIfDestroyed(task, historyIterator)
	}
	return nil
}

func (w *workflowExecutionContextImpl) GetDecisionTimeout() time.Duration {
	return time.Second * time.Duration(w.workflowInfo.TaskStartToCloseTimeoutSeconds)
}

func (w *workflowExecutionContextImpl) getWorkflowPanicIfIllegaleStatePanic() (*workflowPanicError, bool) {
	if !w.isWorkflowCompleted || w.err == nil {
		return nil, false
	}

	panicErr, ok := w.err.(*workflowPanicError)
	if !ok || panicErr.value == nil {
		return nil, false
	}

	_, ok = panicErr.value.(stateMachineIllegalStatePanic)
	if !ok {
		return nil, false
	}

	return panicErr, true
}

func (wth *workflowTaskHandlerImpl) completeWorkflow(
	eventHandler *workflowExecutionEventHandlerImpl,
	task *s.PollForDecisionTaskResponse,
	workflowContext *workflowExecutionContextImpl,
	decisions []*s.Decision,
	forceNewDecision bool) interface{} {

	// for query tasks
	if task.Query != nil {
		queryCompletedRequest := &s.RespondQueryTaskCompletedRequest{TaskToken: task.TaskToken}
		if panicErr, ok := workflowContext.err.(*PanicError); ok {
			// NOTE: this code path should never be executed; we should check for workflowPanicError instead of PanicError
			wth.logger.Warn("Encountered PanicError in workflow query task",
				zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
				zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
				zap.String(tagPanicError, panicErr.Error()),
				zap.String(tagPanicStack, panicErr.StackTrace()),
			)

			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeFailed)
			queryCompletedRequest.ErrorMessage = common.StringPtr("Workflow panic: " + panicErr.Error())
			return queryCompletedRequest
		}

		if workflowPanicErr, ok := workflowContext.err.(*workflowPanicError); ok {
			// NOTE: in this case we should return a completed query task with CompletedTypeFailed,
			// but we didn't check for the right error type before, and changing it now may break existing customers
			wth.logger.Warn("Ignored workflow panic error for query, query result may be partial",
				zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
				zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
				zap.String(tagPanicError, workflowPanicErr.Error()),
				zap.String(tagPanicStack, workflowPanicErr.StackTrace()),
				zap.Int64("PreviousStartedEventID", task.GetPreviousStartedEventId()),
			)
		}

		result, err := eventHandler.ProcessQuery(task.Query.GetQueryType(), task.Query.QueryArgs)
		if err != nil {
			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeFailed)
			queryCompletedRequest.ErrorMessage = common.StringPtr(err.Error())
		} else {
			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeCompleted)
			queryCompletedRequest.QueryResult = result
		}
		return queryCompletedRequest
	}

	metricsScope := wth.metricsScope.GetTaggedScope(tagWorkflowType, eventHandler.workflowEnvironmentImpl.workflowInfo.WorkflowType.Name)

	// fail the decision task on decider panic
	if panicErr, ok := workflowContext.err.(*workflowPanicError); ok {
		// Workflow panic
metricsScope.Counter(metrics.DecisionTaskPanicCounter).Inc(1) 1275 wth.logger.Error("Workflow panic.", 1276 zap.String(tagWorkflowType, task.WorkflowType.GetName()), 1277 zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()), 1278 zap.String(tagRunID, task.WorkflowExecution.GetRunId()), 1279 zap.String(tagPanicError, panicErr.Error()), 1280 zap.String(tagPanicStack, panicErr.StackTrace())) 1281 return errorToFailDecisionTask(task.TaskToken, panicErr, wth.identity) 1282 } 1283 1284 // complete decision task 1285 var closeDecision *s.Decision 1286 if canceledErr, ok := workflowContext.err.(*CanceledError); ok { 1287 // Workflow cancelled 1288 metricsScope.Counter(metrics.WorkflowCanceledCounter).Inc(1) 1289 closeDecision = createNewDecision(s.DecisionTypeCancelWorkflowExecution) 1290 _, details := getErrorDetails(canceledErr, wth.dataConverter) 1291 closeDecision.CancelWorkflowExecutionDecisionAttributes = &s.CancelWorkflowExecutionDecisionAttributes{ 1292 Details: details, 1293 } 1294 } else if contErr, ok := workflowContext.err.(*ContinueAsNewError); ok { 1295 // Continue as new error. 1296 metricsScope.Counter(metrics.WorkflowContinueAsNewCounter).Inc(1) 1297 closeDecision = createNewDecision(s.DecisionTypeContinueAsNewWorkflowExecution) 1298 closeDecision.ContinueAsNewWorkflowExecutionDecisionAttributes = &s.ContinueAsNewWorkflowExecutionDecisionAttributes{ 1299 WorkflowType: workflowTypePtr(*contErr.params.workflowType), 1300 Input: contErr.params.input, 1301 TaskList: common.TaskListPtr(s.TaskList{Name: contErr.params.taskListName}), 1302 ExecutionStartToCloseTimeoutSeconds: contErr.params.executionStartToCloseTimeoutSeconds, 1303 TaskStartToCloseTimeoutSeconds: contErr.params.taskStartToCloseTimeoutSeconds, 1304 Header: contErr.params.header, 1305 Memo: workflowContext.workflowInfo.Memo, 1306 SearchAttributes: workflowContext.workflowInfo.SearchAttributes, 1307 RetryPolicy: workflowContext.workflowInfo.RetryPolicy, 1308 } 1309 } else if workflowContext.err != nil { 1310 // Workflow failures 1311 metricsScope.Counter(metrics.WorkflowFailedCounter).Inc(1) 1312 closeDecision = createNewDecision(s.DecisionTypeFailWorkflowExecution) 1313 reason, details := getErrorDetails(workflowContext.err, wth.dataConverter) 1314 closeDecision.FailWorkflowExecutionDecisionAttributes = &s.FailWorkflowExecutionDecisionAttributes{ 1315 Reason: common.StringPtr(reason), 1316 Details: details, 1317 } 1318 } else if workflowContext.isWorkflowCompleted { 1319 // Workflow completion 1320 metricsScope.Counter(metrics.WorkflowCompletedCounter).Inc(1) 1321 closeDecision = createNewDecision(s.DecisionTypeCompleteWorkflowExecution) 1322 closeDecision.CompleteWorkflowExecutionDecisionAttributes = &s.CompleteWorkflowExecutionDecisionAttributes{ 1323 Result: workflowContext.result, 1324 } 1325 } 1326 1327 if closeDecision != nil { 1328 decisions = append(decisions, closeDecision) 1329 elapsed := time.Since(workflowContext.workflowStartTime) 1330 metricsScope.Timer(metrics.WorkflowEndToEndLatency).Record(elapsed) 1331 forceNewDecision = false 1332 } 1333 1334 var queryResults map[string]*s.WorkflowQueryResult 1335 if len(task.Queries) != 0 { 1336 queryResults = make(map[string]*s.WorkflowQueryResult) 1337 for queryID, query := range task.Queries { 1338 result, err := eventHandler.ProcessQuery(query.GetQueryType(), query.QueryArgs) 1339 if err != nil { 1340 queryResults[queryID] = &s.WorkflowQueryResult{ 1341 ResultType: common.QueryResultTypePtr(s.QueryResultTypeFailed), 1342 ErrorMessage: 
common.StringPtr(err.Error()), 1343 } 1344 } else { 1345 queryResults[queryID] = &s.WorkflowQueryResult{ 1346 ResultType: common.QueryResultTypePtr(s.QueryResultTypeAnswered), 1347 Answer: result, 1348 } 1349 } 1350 } 1351 } 1352 1353 return &s.RespondDecisionTaskCompletedRequest{ 1354 TaskToken: task.TaskToken, 1355 Decisions: decisions, 1356 Identity: common.StringPtr(wth.identity), 1357 ReturnNewDecisionTask: common.BoolPtr(true), 1358 ForceCreateNewDecisionTask: common.BoolPtr(forceNewDecision), 1359 BinaryChecksum: common.StringPtr(getBinaryChecksum()), 1360 QueryResults: queryResults, 1361 } 1362 } 1363 1364 func errorToFailDecisionTask(taskToken []byte, err error, identity string) *s.RespondDecisionTaskFailedRequest { 1365 failedCause := s.DecisionTaskFailedCauseWorkflowWorkerUnhandledFailure 1366 _, details := getErrorDetails(err, nil) 1367 return &s.RespondDecisionTaskFailedRequest{ 1368 TaskToken: taskToken, 1369 Cause: &failedCause, 1370 Details: details, 1371 Identity: common.StringPtr(identity), 1372 BinaryChecksum: common.StringPtr(getBinaryChecksum()), 1373 } 1374 } 1375 1376 func (wth *workflowTaskHandlerImpl) executeAnyPressurePoints(event *s.HistoryEvent, isInReplay bool) error { 1377 if wth.ppMgr != nil && !reflect.ValueOf(wth.ppMgr).IsNil() && !isInReplay { 1378 switch event.GetEventType() { 1379 case s.EventTypeDecisionTaskStarted: 1380 return wth.ppMgr.Execute(pressurePointTypeDecisionTaskStartTimeout) 1381 case s.EventTypeActivityTaskScheduled: 1382 return wth.ppMgr.Execute(pressurePointTypeActivityTaskScheduleTimeout) 1383 case s.EventTypeActivityTaskStarted: 1384 return wth.ppMgr.Execute(pressurePointTypeActivityTaskStartTimeout) 1385 case s.EventTypeDecisionTaskCompleted: 1386 return wth.ppMgr.Execute(pressurePointTypeDecisionTaskCompleted) 1387 } 1388 } 1389 return nil 1390 } 1391 1392 func newActivityTaskHandler( 1393 service workflowserviceclient.Interface, 1394 params workerExecutionParameters, 1395 registry *registry, 1396 ) ActivityTaskHandler { 1397 return newActivityTaskHandlerWithCustomProvider(service, params, registry, nil) 1398 } 1399 1400 func newActivityTaskHandlerWithCustomProvider( 1401 service workflowserviceclient.Interface, 1402 params workerExecutionParameters, 1403 registry *registry, 1404 activityProvider activityProvider, 1405 ) ActivityTaskHandler { 1406 return &activityTaskHandlerImpl{ 1407 taskListName: params.TaskList, 1408 identity: params.Identity, 1409 service: service, 1410 logger: params.Logger, 1411 metricsScope: metrics.NewTaggedScope(params.MetricsScope), 1412 userContext: params.UserContext, 1413 registry: registry, 1414 activityProvider: activityProvider, 1415 dataConverter: params.DataConverter, 1416 workerStopCh: params.WorkerStopChannel, 1417 contextPropagators: params.ContextPropagators, 1418 tracer: params.Tracer, 1419 featureFlags: params.FeatureFlags, 1420 } 1421 } 1422 1423 type cadenceInvoker struct { 1424 sync.Mutex 1425 identity string 1426 service workflowserviceclient.Interface 1427 taskToken []byte 1428 cancelHandler func() 1429 heartBeatTimeoutInSec int32 // The heart beat interval configured for this activity. 1430 hbBatchEndTimer *time.Timer // Whether we started a batch of operations that need to be reported in the cycle. This gets started on a user call. 1431 detailsToReport *[]byte // Details to be reported in the next reporting interval. 1432 lastDetailsReported *[]byte // Details that were reported in the last reporting interval. 
1433 closeCh chan struct{} 1434 workerStopChannel <-chan struct{} 1435 featureFlags FeatureFlags 1436 logger *zap.Logger 1437 workflowType string 1438 activityType string 1439 } 1440 1441 func (i *cadenceInvoker) Heartbeat(details []byte) error { 1442 i.Lock() 1443 defer i.Unlock() 1444 1445 _, err := i.internalHeartBeat(details) 1446 return err 1447 } 1448 1449 func (i *cadenceInvoker) BackgroundHeartbeat() error { 1450 i.Lock() 1451 defer i.Unlock() 1452 1453 if i.hbBatchEndTimer != nil { 1454 if i.detailsToReport == nil { 1455 i.detailsToReport = i.lastDetailsReported 1456 } 1457 1458 return nil 1459 } 1460 1461 var details []byte 1462 if i.detailsToReport != nil { 1463 details = *i.detailsToReport 1464 } else if i.lastDetailsReported != nil { 1465 details = *i.lastDetailsReported 1466 } 1467 1468 return i.heartbeatAndScheduleNextRun(details) 1469 } 1470 1471 func (i *cadenceInvoker) BatchHeartbeat(details []byte) error { 1472 i.Lock() 1473 defer i.Unlock() 1474 1475 if i.hbBatchEndTimer != nil { 1476 // If we have started batching window, keep track of last reported progress. 1477 i.detailsToReport = &details 1478 return nil 1479 } 1480 1481 return i.heartbeatAndScheduleNextRun(details) 1482 } 1483 1484 func (i *cadenceInvoker) heartbeatAndScheduleNextRun(details []byte) error { 1485 isActivityCancelled, err := i.internalHeartBeat(details) 1486 1487 // If the activity is cancelled, the activity can ignore the cancellation and do its work 1488 // and complete. Our cancellation is co-operative, so we will try to heartbeat. 1489 if err == nil || isActivityCancelled { 1490 // We have successfully sent heartbeat, start next batching window. 1491 i.lastDetailsReported = &details 1492 i.detailsToReport = nil 1493 1494 // Create timer to fire before the threshold to report. 1495 deadlineToTrigger := i.heartBeatTimeoutInSec 1496 if deadlineToTrigger <= 0 { 1497 // If we don't have any heartbeat timeout configured. 1498 deadlineToTrigger = defaultHeartBeatIntervalInSec 1499 } 1500 1501 // We set a deadline at 80% of the timeout. 1502 duration := time.Duration(0.8*float32(deadlineToTrigger)) * time.Second 1503 i.hbBatchEndTimer = time.NewTimer(duration) 1504 1505 go func() { 1506 select { 1507 case <-i.hbBatchEndTimer.C: 1508 // We are close to deadline. 1509 case <-i.workerStopChannel: 1510 // Activity worker is close to stop. This does the same steps as batch timer ends. 1511 case <-i.closeCh: 1512 // We got closed. 1513 return 1514 } 1515 1516 // We close the batch and report the progress. 1517 var detailsToReport *[]byte 1518 1519 i.Lock() 1520 detailsToReport = i.detailsToReport 1521 i.hbBatchEndTimer.Stop() 1522 i.hbBatchEndTimer = nil 1523 1524 var err error 1525 if detailsToReport != nil { 1526 err = i.heartbeatAndScheduleNextRun(*detailsToReport) 1527 } 1528 i.Unlock() 1529 1530 // Log the error outside the lock. 1531 i.logFailedHeartBeat(err) 1532 }() 1533 } 1534 1535 return err 1536 } 1537 1538 func (i *cadenceInvoker) logFailedHeartBeat(err error) { 1539 // If the error is a canceled error do not log, as this is expected. 1540 var canceledErr *CanceledError 1541 1542 // We need to check for nil as errors.As returns false for nil. Which would cause us to log on nil. 
	if err != nil && !errors.As(err, &canceledErr) {
		i.logger.Error("Failed to send heartbeat", zap.Error(err), zap.String(tagWorkflowType, i.workflowType), zap.String(tagActivityType, i.activityType))
	}
}

func (i *cadenceInvoker) internalHeartBeat(details []byte) (bool, error) {
	isActivityCancelled := false
	timeout := time.Duration(i.heartBeatTimeoutInSec) * time.Second
	if timeout <= 0 {
		timeout = time.Duration(defaultHeartBeatIntervalInSec) * time.Second
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	err := recordActivityHeartbeat(ctx, i.service, i.identity, i.taskToken, details, i.featureFlags)

	switch err.(type) {
	case *CanceledError:
		// We were asked to cancel; inform the activity of the cancellation through its context.
		i.cancelHandler()
		isActivityCancelled = true

	case *s.EntityNotExistsError, *s.WorkflowExecutionAlreadyCompletedError, *s.DomainNotActiveError:
		// Treat these as cancellation for now; this may change once the cancel handler has a dedicated setter.
		i.cancelHandler()
		isActivityCancelled = true
	}

	// We don't want to bubble temporary errors up to the user.
	// This error is not returned to the user; see RecordActivityHeartbeat().
	return isActivityCancelled, err
}

func (i *cadenceInvoker) Close(flushBufferedHeartbeat bool) {
	i.Lock()
	defer i.Unlock()
	close(i.closeCh)
	if i.hbBatchEndTimer != nil {
		i.hbBatchEndTimer.Stop()
		if flushBufferedHeartbeat && i.detailsToReport != nil {
			i.internalHeartBeat(*i.detailsToReport) // best-effort flush; any error is dropped here
			i.lastDetailsReported = i.detailsToReport
			i.detailsToReport = nil
		}
	}
}

func (i *cadenceInvoker) SignalWorkflow(ctx context.Context, domain, workflowID, runID, signalName string, signalInput []byte) error {
	return signalWorkflow(ctx, i.service, i.identity, domain, workflowID, runID, signalName, signalInput, i.featureFlags)
}

func newServiceInvoker(
	taskToken []byte,
	identity string,
	service workflowserviceclient.Interface,
	cancelHandler func(),
	heartBeatTimeoutInSec int32,
	workerStopChannel <-chan struct{},
	featureFlags FeatureFlags,
	logger *zap.Logger,
	workflowType string,
	activityType string,
) ServiceInvoker {
	return &cadenceInvoker{
		taskToken:             taskToken,
		identity:              identity,
		service:               service,
		cancelHandler:         cancelHandler,
		heartBeatTimeoutInSec: heartBeatTimeoutInSec,
		closeCh:               make(chan struct{}),
		workerStopChannel:     workerStopChannel,
		featureFlags:          featureFlags,
		logger:                logger,
		workflowType:          workflowType,
		activityType:          activityType,
	}
}

// Execute executes an implementation of the activity.
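// It sets up a cancellable activity context and a heartbeat invoker, looks up the
// registered activity implementation, recovers panics into a failed-task response,
// optionally runs an auto-heartbeat loop, and converts the activity's result or
// error into the corresponding Respond*Request.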
func (ath *activityTaskHandlerImpl) Execute(taskList string, t *s.PollForActivityTaskResponse) (result interface{}, err error) {
	traceLog(func() {
		ath.logger.Debug("Processing new activity task",
			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
			zap.String(tagActivityType, t.ActivityType.GetName()))
	})

	rootCtx := ath.userContext
	if rootCtx == nil {
		rootCtx = context.Background()
	}
	canCtx, cancel := context.WithCancel(rootCtx)
	defer cancel()

	workflowType := t.WorkflowType.GetName()
	activityType := t.ActivityType.GetName()
	invoker := newServiceInvoker(t.TaskToken, ath.identity, ath.service, cancel, t.GetHeartbeatTimeoutSeconds(), ath.workerStopCh, ath.featureFlags, ath.logger, workflowType, activityType)
	defer func() {
		_, activityCompleted := result.(*s.RespondActivityTaskCompletedRequest)
		invoker.Close(!activityCompleted) // flush buffered heartbeat if activity was not successfully completed.
	}()

	metricsScope := getMetricsScopeForActivity(ath.metricsScope, workflowType, activityType)
	ctx := WithActivityTask(canCtx, t, taskList, invoker, ath.logger, metricsScope, ath.dataConverter, ath.workerStopCh, ath.contextPropagators, ath.tracer)

	activityImplementation := ath.getActivity(activityType)
	if activityImplementation == nil {
		// Couldn't find the activity implementation.
		supported := strings.Join(ath.getRegisteredActivityNames(), ", ")
		return nil, fmt.Errorf("unable to find activityType=%v. Supported types: [%v]", activityType, supported)
	}

	// panic handler
	defer func() {
		if p := recover(); p != nil {
			topLine := fmt.Sprintf("activity for %s [panic]:", ath.taskListName)
			st := getStackTraceRaw(topLine, 7, 0)
			ath.logger.Error("Activity panic.",
				zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
				zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
				zap.String(tagActivityType, activityType),
				zap.String(tagPanicError, fmt.Sprintf("%v", p)),
				zap.String(tagPanicStack, st))
			metricsScope.Counter(metrics.ActivityTaskPanicCounter).Inc(1)
			panicErr := newPanicError(p, st)
			result, err = convertActivityResultToRespondRequest(ath.identity, t.TaskToken, nil, panicErr, ath.dataConverter), nil
		}
	}()

	// propagate context information into the activity context from the headers
	for _, ctxProp := range ath.contextPropagators {
		var err error
		if ctx, err = ctxProp.Extract(ctx, NewHeaderReader(t.Header)); err != nil {
			return nil, fmt.Errorf("unable to propagate context %v", err)
		}
	}

	info := ctx.Value(activityEnvContextKey).(*activityEnvironment)
	ctx, dlCancelFunc := context.WithDeadline(ctx, info.deadline)
	defer dlCancelFunc()

	ctx, span := createOpenTracingActivitySpan(ctx, ath.tracer, time.Now(), activityType, t.WorkflowExecution.GetWorkflowId(), t.WorkflowExecution.GetRunId())
	defer span.Finish()

	if activityImplementation.GetOptions().EnableAutoHeartbeat && t.HeartbeatTimeoutSeconds != nil && *t.HeartbeatTimeoutSeconds > 0 {
		go func() {
			autoHbInterval := time.Duration(*t.HeartbeatTimeoutSeconds) * time.Second / 2
			ticker := time.NewTicker(autoHbInterval)
			defer ticker.Stop()
			for {
				select {
				case <-ath.workerStopCh:
					return
				case <-ctx.Done():
					return
				case <-ticker.C:
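					// The auto-heartbeat interval elapsed; report buffered progress in the background.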
					hbErr := invoker.BackgroundHeartbeat()
					if hbErr != nil && !IsCanceledError(hbErr) {
						ath.logger.Error("Activity auto heartbeat error.",
							zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
							zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
							zap.String(tagActivityType, activityType),
							zap.Error(hbErr),
						)
					}
				}
			}
		}()
	}

	output, err := activityImplementation.Execute(ctx, t.Input)

	dlCancelFunc()
	// dlCancelFunc has already been called, so ctx.Done() is closed and this receive does not block;
	// ctx.Err() is DeadlineExceeded only if the activity ran past its deadline.
	if <-ctx.Done(); ctx.Err() == context.DeadlineExceeded {
		ath.logger.Warn("Activity timeout.",
			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
			zap.String(tagActivityType, activityType),
		)
		return nil, ctx.Err()
	}
	if err != nil && err != ErrActivityResultPending {
		ath.logger.Error("Activity error.",
			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
			zap.String(tagActivityType, activityType),
			zap.Error(err),
		)
	}
	return convertActivityResultToRespondRequest(ath.identity, t.TaskToken, output, err, ath.dataConverter), nil
}

func (ath *activityTaskHandlerImpl) getActivity(name string) activity {
	if ath.activityProvider != nil {
		return ath.activityProvider(name)
	}

	if a, ok := ath.registry.GetActivity(name); ok {
		return a
	}

	return nil
}

func (ath *activityTaskHandlerImpl) getRegisteredActivityNames() (activityNames []string) {
	for _, a := range ath.registry.getRegisteredActivities() {
		activityNames = append(activityNames, a.ActivityType().Name)
	}
	return
}

func createNewDecision(decisionType s.DecisionType) *s.Decision {
	return &s.Decision{
		DecisionType: common.DecisionTypePtr(decisionType),
	}
}
func signalWorkflow(
	ctx context.Context,
	service workflowserviceclient.Interface,
	identity string,
	domain string,
	workflowID string,
	runID string,
	signalName string,
	signalInput []byte,
	featureFlags FeatureFlags,
) error {
	request := &s.SignalWorkflowExecutionRequest{
		Domain: common.StringPtr(domain),
		WorkflowExecution: &s.WorkflowExecution{
			WorkflowId: common.StringPtr(workflowID),
			RunId:      getRunID(runID),
		},
		SignalName: common.StringPtr(signalName),
		Input:      signalInput,
		Identity:   common.StringPtr(identity),
	}

	return backoff.Retry(ctx,
		func() error {
			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
			defer cancel()
			return service.SignalWorkflowExecution(tchCtx, request, opt...)
		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)
}

func recordActivityHeartbeat(
	ctx context.Context,
	service workflowserviceclient.Interface,
	identity string,
	taskToken, details []byte,
	featureFlags FeatureFlags,
) error {
	request := &s.RecordActivityTaskHeartbeatRequest{
		TaskToken: taskToken,
		Details:   details,
		Identity:  common.StringPtr(identity)}

	var heartbeatResponse *s.RecordActivityTaskHeartbeatResponse
	heartbeatErr := backoff.Retry(ctx,
		func() error {
			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
			defer cancel()

			var err error
			heartbeatResponse, err = service.RecordActivityTaskHeartbeat(tchCtx, request, opt...)
			return err
		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)

	if heartbeatErr == nil && heartbeatResponse != nil && heartbeatResponse.GetCancelRequested() {
		return NewCanceledError()
	}

	return heartbeatErr
}

func recordActivityHeartbeatByID(
	ctx context.Context,
	service workflowserviceclient.Interface,
	identity string,
	domain, workflowID, runID, activityID string,
	details []byte,
	featureFlags FeatureFlags,
) error {
	request := &s.RecordActivityTaskHeartbeatByIDRequest{
		Domain:     common.StringPtr(domain),
		WorkflowID: common.StringPtr(workflowID),
		RunID:      common.StringPtr(runID),
		ActivityID: common.StringPtr(activityID),
		Details:    details,
		Identity:   common.StringPtr(identity)}

	var heartbeatResponse *s.RecordActivityTaskHeartbeatResponse
	heartbeatErr := backoff.Retry(ctx,
		func() error {
			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
			defer cancel()

			var err error
			heartbeatResponse, err = service.RecordActivityTaskHeartbeatByID(tchCtx, request, opt...)
			return err
		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)

	if heartbeatErr == nil && heartbeatResponse != nil && heartbeatResponse.GetCancelRequested() {
		return NewCanceledError()
	}

	return heartbeatErr
}

// traceLog runs fn only when verbose logging is enabled in the client library;
// see worker.EnableVerboseLogging().
func traceLog(fn func()) {
	if enableVerboseLogging {
		fn()
	}
}

func workflowCategorizedByTimeout(wfContext *workflowExecutionContextImpl) string {
	executionTimeout := wfContext.workflowInfo.ExecutionStartToCloseTimeoutSeconds
	if executionTimeout <= defaultInstantLivedWorkflowTimeoutUpperLimitInSec {
		return "instant"
	} else if executionTimeout <= defaultShortLivedWorkflowTimeoutUpperLimitInSec {
		return "short"
	} else if executionTimeout <= defaultMediumLivedWorkflowTimeoutUpperLimitInSec {
		return "intermediate"
	} else {
		return "long"
	}
}
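
// exampleAutoHeartbeatLoop is an illustrative sketch only (it is not called
// anywhere in this package): it shows, in isolation, the auto-heartbeat
// pattern used inside Execute above, assuming a caller-supplied heartbeat
// callback. Ticking at half of the heartbeat timeout leaves headroom so a
// slow or missed tick does not let the server-side heartbeat timeout expire.
func exampleAutoHeartbeatLoop(ctx context.Context, heartbeatTimeout time.Duration, heartbeat func() error) {
	ticker := time.NewTicker(heartbeatTimeout / 2)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// The activity finished or was cancelled; stop heartbeating.
			return
		case <-ticker.C:
			// Report progress; the real implementation logs non-cancellation errors.
			_ = heartbeat()
		}
	}
}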