go.uber.org/cadence@v1.2.9/internal/internal_task_handlers.go

// Copyright (c) 2017-2020 Uber Technologies Inc.
// Portions of the Software are attributed to Copyright (c) 2020 Temporal Technologies Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package internal

// All code in this file is private to the package.

import (
	"context"
	"errors"
	"fmt"
	"math"
	"reflect"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/opentracing/opentracing-go"
	"github.com/uber-go/tally"
	"go.uber.org/zap"

	"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
	s "go.uber.org/cadence/.gen/go/shared"
	"go.uber.org/cadence/internal/common"
	"go.uber.org/cadence/internal/common/backoff"
	"go.uber.org/cadence/internal/common/cache"
	"go.uber.org/cadence/internal/common/metrics"
)

const (
	defaultHeartBeatIntervalInSec = 10 * 60 // 10 minutes

	defaultStickyCacheSize = 10000

	noRetryBackoff = time.Duration(-1)

	// Upper limits (in seconds) used to bucket workflows by their execution
	// timeout (see workflowCategorizedByTimeout) when tagging decision metrics
	// with a runtime-length category.
	defaultInstantLivedWorkflowTimeoutUpperLimitInSec = 1

	defaultShortLivedWorkflowTimeoutUpperLimitInSec = 1 * 1800 // 30 minutes

	defaultMediumLivedWorkflowTimeoutUpperLimitInSec = 8 * 3600 // 8 hours
)

type (
	// workflowExecutionEventHandler processes a single event.
	workflowExecutionEventHandler interface {
		// ProcessEvent processes a single event and records the associated
		// decisions; it returns an error if processing fails.
		ProcessEvent(event *s.HistoryEvent, isReplay bool, isLast bool) error
		// ProcessQuery processes a query request.
		ProcessQuery(queryType string, queryArgs []byte) ([]byte, error)
		StackTrace() string
		// Close cleans up resources held by this event handler.
		Close()
	}

	// workflowTask wraps a decision task.
	workflowTask struct {
		task            *s.PollForDecisionTaskResponse
		historyIterator HistoryIterator
		doneCh          chan struct{}
		laResultCh      chan *localActivityResult
	}

	// activityTask wraps an activity task.
	activityTask struct {
		task          *s.PollForActivityTaskResponse
		pollStartTime time.Time
	}

	// resetStickinessTask wraps a ResetStickyTaskListRequest.
	resetStickinessTask struct {
		task *s.ResetStickyTaskListRequest
	}

	// workflowExecutionContextImpl is the cached workflow state for sticky execution
	workflowExecutionContextImpl struct {
		mutex             sync.Mutex
		workflowStartTime time.Time
		workflowInfo      *WorkflowInfo
		wth               *workflowTaskHandlerImpl

		// eventHandler was changed to an atomic.Value as a temporary fix for the
		// local activity retry issue (github issue #915). The mutex should still
		// be held when accessing or modifying this field.
		eventHandler atomic.Value

		isWorkflowCompleted bool
		result              []byte
		err                 error

		previousStartedEventID int64

		newDecisions        []*s.Decision
		currentDecisionTask *s.PollForDecisionTaskResponse
		laTunnel            *localActivityTunnel
		decisionStartTime   time.Time
	}

	// workflowTaskHandlerImpl is the implementation of WorkflowTaskHandler
	workflowTaskHandlerImpl struct {
		domain                         string
		metricsScope                   *metrics.TaggedScope
		ppMgr                          pressurePointMgr
		logger                         *zap.Logger
		identity                       string
		enableLoggingInReplay          bool
		disableStickyExecution         bool
		registry                       *registry
		laTunnel                       *localActivityTunnel
		nonDeterministicWorkflowPolicy NonDeterministicWorkflowPolicy
		dataConverter                  DataConverter
		contextPropagators             []ContextPropagator
		tracer                         opentracing.Tracer
		workflowInterceptorFactories   []WorkflowInterceptorFactory
		disableStrictNonDeterminism    bool
	}

	activityProvider func(name string) activity

	// activityTaskHandlerImpl is the implementation of ActivityTaskHandler
	activityTaskHandlerImpl struct {
		taskListName       string
		identity           string
		service            workflowserviceclient.Interface
		metricsScope       *metrics.TaggedScope
		logger             *zap.Logger
		userContext        context.Context
		registry           *registry
		activityProvider   activityProvider
		dataConverter      DataConverter
		workerStopCh       <-chan struct{}
		contextPropagators []ContextPropagator
		tracer             opentracing.Tracer
		featureFlags       FeatureFlags
	}

	// history is a wrapper around the workflow task's history that helps
	// iterate over and reorder events.
	history struct {
		workflowTask   *workflowTask
		eventsHandler  *workflowExecutionEventHandlerImpl
		loadedEvents   []*s.HistoryEvent
		currentIndex   int
		nextEventID    int64 // next expected eventID, used for sanity checks
		lastEventID    int64 // last expected eventID, zero indicates read until end of stream
		next           []*s.HistoryEvent
		binaryChecksum *string
	}

	decisionHeartbeatError struct {
		Message string
	}
)

func newHistory(task *workflowTask, eventsHandler *workflowExecutionEventHandlerImpl) *history {
	result := &history{
		workflowTask:  task,
		eventsHandler: eventsHandler,
		loadedEvents:  task.task.History.Events,
		currentIndex:  0,
		// don't set lastEventID to task.GetNextEventId():
		// for a sticky query the history in the workflow task will be empty
		// and the query will run based on the existing workflow state,
		// so the sanity check in verifyAllEventsProcessed would fail
		lastEventID: task.task.GetStartedEventId(),
	}
	if len(result.loadedEvents) > 0 {
		result.nextEventID = result.loadedEvents[0].GetEventId()
	}
	return result
}

func (e decisionHeartbeatError) Error() string {
	return e.Message
}

// GetWorkflowStartedEvent gets the workflow start event.
func (eh *history) GetWorkflowStartedEvent() (*s.HistoryEvent, error) {
	events := eh.workflowTask.task.History.Events
	if len(events) == 0 || events[0].GetEventType() != s.EventTypeWorkflowExecutionStarted {
		return nil, errors.New("unable to find WorkflowExecutionStartedEventAttributes in the history")
	}
	return events[0], nil
}

func (eh *history) IsReplayEvent(event *s.HistoryEvent) bool {
	return event.GetEventId() <= eh.workflowTask.task.GetPreviousStartedEventId() || isDecisionEvent(event.GetEventType())
}

func (eh *history) IsNextDecisionFailed() (isFailed bool, binaryChecksum *string, err error) {
	nextIndex := eh.currentIndex + 1
	if nextIndex >= len(eh.loadedEvents) && eh.hasMoreEvents() { // current page ends and there are more pages
		if err := eh.loadMoreEvents(); err != nil {
			return false, nil, err
		}
	}

	if nextIndex < len(eh.loadedEvents) {
		nextEvent := eh.loadedEvents[nextIndex]
		nextEventType := nextEvent.GetEventType()
		isFailed := nextEventType == s.EventTypeDecisionTaskTimedOut || nextEventType == s.EventTypeDecisionTaskFailed
		var binaryChecksum *string
		if nextEventType == s.EventTypeDecisionTaskCompleted {
			binaryChecksum = nextEvent.DecisionTaskCompletedEventAttributes.BinaryChecksum
		}
		return isFailed, binaryChecksum, nil
	}
	return false, nil, nil
}

func (eh *history) loadMoreEvents() error {
	historyPage, err := eh.getMoreEvents()
	if err != nil {
		return err
	}
	eh.loadedEvents = append(eh.loadedEvents, historyPage.Events...)
	if eh.nextEventID == 0 && len(eh.loadedEvents) > 0 {
		eh.nextEventID = eh.loadedEvents[0].GetEventId()
	}
	return nil
}

func isDecisionEvent(eventType s.EventType) bool {
	switch eventType {
	case s.EventTypeWorkflowExecutionCompleted,
		s.EventTypeWorkflowExecutionFailed,
		s.EventTypeWorkflowExecutionCanceled,
		s.EventTypeWorkflowExecutionContinuedAsNew,
		s.EventTypeActivityTaskScheduled,
		s.EventTypeActivityTaskCancelRequested,
		s.EventTypeTimerStarted,
		s.EventTypeTimerCanceled,
		s.EventTypeCancelTimerFailed,
		s.EventTypeMarkerRecorded,
		s.EventTypeStartChildWorkflowExecutionInitiated,
		s.EventTypeRequestCancelExternalWorkflowExecutionInitiated,
		s.EventTypeSignalExternalWorkflowExecutionInitiated,
		s.EventTypeUpsertWorkflowSearchAttributes:
		return true
	default:
		return false
	}
}

// NextDecisionEvents returns the events that are processed as new by the next decision.
// TODO(maxim): Refactor to return a struct instead of multiple parameters
func (eh *history) NextDecisionEvents() (result []*s.HistoryEvent, markers []*s.HistoryEvent, binaryChecksum *string, err error) {
	if eh.next == nil {
		eh.next, _, err = eh.nextDecisionEvents()
		if err != nil {
			return result, markers, eh.binaryChecksum, err
		}
	}

	result = eh.next
	checksum := eh.binaryChecksum
	if len(result) > 0 {
		eh.next, markers, err = eh.nextDecisionEvents()
	}
	return result, markers, checksum, err
}

func (eh *history) HasNextDecisionEvents() bool {
	return len(eh.next) != 0 || eh.currentIndex != len(eh.loadedEvents) || eh.hasMoreEvents()
}

func (eh *history) hasMoreEvents() bool {
	historyIterator := eh.workflowTask.historyIterator
	return historyIterator != nil && historyIterator.HasNextPage()
}

func (eh *history) getMoreEvents() (*s.History, error) {
	return eh.workflowTask.historyIterator.GetNextPage()
}

func (eh *history) verifyAllEventsProcessed() error {
	if eh.lastEventID > 0 && eh.nextEventID <= eh.lastEventID {
		return fmt.Errorf(
			"history_events: premature end of stream, expectedLastEventID=%v but no more events after eventID=%v",
			eh.lastEventID,
			eh.nextEventID-1)
	}
	if eh.lastEventID > 0 && eh.nextEventID != (eh.lastEventID+1) {
		eh.eventsHandler.logger.Warn(
			"history_events: processed events past the expected lastEventID",
			zap.Int64("expectedLastEventID", eh.lastEventID),
			zap.Int64("processedLastEventID", eh.nextEventID-1))
	}
	return nil
}

func (eh *history) nextDecisionEvents() (nextEvents []*s.HistoryEvent, markers []*s.HistoryEvent, err error) {
	if eh.currentIndex == len(eh.loadedEvents) && !eh.hasMoreEvents() {
		if err := eh.verifyAllEventsProcessed(); err != nil {
			return nil, nil, err
		}
		return []*s.HistoryEvent{}, []*s.HistoryEvent{}, nil
	}

	// Process events

OrderEvents:
	for {
		// load more history events if needed
		for eh.currentIndex == len(eh.loadedEvents) {
			if !eh.hasMoreEvents() {
				if err = eh.verifyAllEventsProcessed(); err != nil {
					return
				}
				break OrderEvents
			}
			if err = eh.loadMoreEvents(); err != nil {
				return
			}
		}

		event := eh.loadedEvents[eh.currentIndex]
		eventID := event.GetEventId()
		if eventID != eh.nextEventID {
			err = fmt.Errorf(
				"missing history events, expectedNextEventID=%v but receivedNextEventID=%v",
				eh.nextEventID, eventID)
			return
		}

		eh.nextEventID++

		switch event.GetEventType() {
		case s.EventTypeDecisionTaskStarted:
			isFailed, binaryChecksum, err1 := eh.IsNextDecisionFailed()
			if err1 != nil {
				err = err1
				return
			}
			if !isFailed {
				eh.binaryChecksum = binaryChecksum
				eh.currentIndex++
				nextEvents = append(nextEvents, event)
				break OrderEvents
			}
		case s.EventTypeDecisionTaskScheduled,
			s.EventTypeDecisionTaskTimedOut,
			s.EventTypeDecisionTaskFailed:
			// Skip
		default:
			if isPreloadMarkerEvent(event) {
				markers = append(markers, event)
			}
			nextEvents = append(nextEvents, event)
		}
		eh.currentIndex++
	}

	// shrink loaded events so processed events can be GCed
	eh.loadedEvents = eh.loadedEvents[eh.currentIndex:]
	eh.currentIndex = 0

	return nextEvents, markers, nil
}

func isPreloadMarkerEvent(event *s.HistoryEvent) bool {
	return event.GetEventType() == s.EventTypeMarkerRecorded
}

// newWorkflowTaskHandler returns an implementation of workflow task handler.
func newWorkflowTaskHandler(
	domain string,
	params workerExecutionParameters,
	ppMgr pressurePointMgr,
	registry *registry,
) WorkflowTaskHandler {
	ensureRequiredParams(&params)
	wth := &workflowTaskHandlerImpl{
		domain:                         domain,
		logger:                         params.Logger,
		ppMgr:                          ppMgr,
		metricsScope:                   metrics.NewTaggedScope(params.MetricsScope),
		identity:                       params.Identity,
		enableLoggingInReplay:          params.EnableLoggingInReplay,
		disableStickyExecution:         params.DisableStickyExecution,
		registry:                       registry,
		nonDeterministicWorkflowPolicy: params.NonDeterministicWorkflowPolicy,
		dataConverter:                  params.DataConverter,
		contextPropagators:             params.ContextPropagators,
		tracer:                         params.Tracer,
		workflowInterceptorFactories:   params.WorkflowInterceptorChainFactories,
		disableStrictNonDeterminism:    params.WorkerBugPorts.DisableStrictNonDeterminismCheck,
	}

	traceLog(func() {
		wth.logger.Debug("Workflow task handler is created.",
			zap.String(tagDomain, wth.domain),
			zap.Bool("disableStrictNonDeterminism", wth.disableStrictNonDeterminism))
	})

	return wth
}

// TODO: need a better eviction policy based on memory usage
var workflowCache cache.Cache
var stickyCacheSize = defaultStickyCacheSize
var initCacheOnce sync.Once
var stickyCacheLock sync.Mutex

// SetStickyWorkflowCacheSize sets the cache size for the sticky workflow cache. Sticky workflow execution is the
// affinity between decision tasks of a specific workflow execution and a specific worker. The affinity is set if
// sticky execution is enabled via Worker.Options (it is enabled by default unless disabled explicitly). The benefit
// of sticky execution is that the workflow does not have to reconstruct its state by replaying history from the
// beginning. The cost is higher memory usage, since it relies on caching the workflow execution's running state on
// the worker. The cache is shared between workers running within the same process. This must be called before any
// worker is started. If not called, the default size of 10K (which might change in the future) will be used.
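//
// For illustration only, a sketch of typical usage via the public worker
// package (assuming the usual re-export of this function; pick a size that
// fits your memory budget):
//
//	import "go.uber.org/cadence/worker"
//
//	func init() {
//		// must run before any worker is started
//		worker.SetStickyWorkflowCacheSize(20000)
//	}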
func SetStickyWorkflowCacheSize(cacheSize int) {
	stickyCacheLock.Lock()
	defer stickyCacheLock.Unlock()
	if workflowCache != nil {
		panic("cache already created, please set cache size before worker starts.")
	}
	stickyCacheSize = cacheSize
}

func getWorkflowCache() cache.Cache {
	initCacheOnce.Do(func() {
		stickyCacheLock.Lock()
		defer stickyCacheLock.Unlock()
		workflowCache = cache.New(stickyCacheSize, &cache.Options{
			RemovedFunc: func(cachedEntity interface{}) {
				wc := cachedEntity.(*workflowExecutionContextImpl)
				wc.onEviction()
			},
		})
	})
	return workflowCache
}

func getWorkflowContext(runID string) *workflowExecutionContextImpl {
	o := getWorkflowCache().Get(runID)
	if o == nil {
		return nil
	}
	wc := o.(*workflowExecutionContextImpl)
	return wc
}

func putWorkflowContext(runID string, wc *workflowExecutionContextImpl) (*workflowExecutionContextImpl, error) {
	existing, err := getWorkflowCache().PutIfNotExist(runID, wc)
	if err != nil {
		return nil, err
	}
	return existing.(*workflowExecutionContextImpl), nil
}

func removeWorkflowContext(runID string) {
	getWorkflowCache().Delete(runID)
}

func newWorkflowExecutionContext(
	startTime time.Time,
	workflowInfo *WorkflowInfo,
	taskHandler *workflowTaskHandlerImpl,
) *workflowExecutionContextImpl {
	workflowContext := &workflowExecutionContextImpl{
		workflowStartTime: startTime,
		workflowInfo:      workflowInfo,
		wth:               taskHandler,
	}
	workflowContext.createEventHandler()
	return workflowContext
}

func (w *workflowExecutionContextImpl) Lock() {
	w.mutex.Lock()
}

func (w *workflowExecutionContextImpl) Unlock(err error) {
	cleared := false
	cached := getWorkflowCache().Exist(w.workflowInfo.WorkflowExecution.RunID)
	if err != nil || w.err != nil || w.isWorkflowCompleted || (w.wth.disableStickyExecution && !w.hasPendingLocalActivityWork()) {
		// TODO: in the close case, this assumes the close decision always succeeds. A server-side change is needed
		// to return an error indicating the close failure case. This should be a rare case. For now, always remove
		// the cache, and if the close decision failed, the next decision will have to rebuild the state.
		if cached {
			// also clears state asynchronously via cache eviction
			removeWorkflowContext(w.workflowInfo.WorkflowExecution.RunID)
		} else {
			w.clearState()
		}
		cleared = true
	}
	// there are a variety of reasons a workflow may not have been put into the cache.
	// all of them mean we need to clear the state at this point, or any running goroutines will be orphaned.
	if !cleared && !cached {
		w.clearState()
	}

	w.mutex.Unlock()
}

func (w *workflowExecutionContextImpl) getEventHandler() *workflowExecutionEventHandlerImpl {
	eventHandler := w.eventHandler.Load()
	if eventHandler == nil {
		return nil
	}
	eventHandlerImpl, ok := eventHandler.(*workflowExecutionEventHandlerImpl)
	if !ok {
		panic("unknown type for workflow execution event handler")
	}
	return eventHandlerImpl
}

func (w *workflowExecutionContextImpl) completeWorkflow(result []byte, err error) {
	w.isWorkflowCompleted = true
	w.result = result
	w.err = err
}

func (w *workflowExecutionContextImpl) shouldResetStickyOnEviction() bool {
	// Not all evictions from the cache warrant a call to the server
	// to reset stickiness.
	// Cases where this is redundant or unnecessary include
	// when an error was encountered during execution
	// or the workflow simply completed successfully.
	return w.err == nil && !w.isWorkflowCompleted
}

func (w *workflowExecutionContextImpl) onEviction() {
	// onEviction is run by the LRU cache's removeFunc in a separate goroutine
	w.mutex.Lock()

	// Queue a ResetStickiness request *BEFORE* calling clearState
	// because once destroyed, no sensible information
	// may be ascertained about the execution context's state,
	// nor should any of its methods be invoked.
	if w.shouldResetStickyOnEviction() {
		w.queueResetStickinessTask()
	}

	w.clearState()
	w.mutex.Unlock()
}

func (w *workflowExecutionContextImpl) IsDestroyed() bool {
	return w.getEventHandler() == nil
}

func (w *workflowExecutionContextImpl) queueResetStickinessTask() {
	var task resetStickinessTask
	task.task = &s.ResetStickyTaskListRequest{
		Domain: common.StringPtr(w.workflowInfo.Domain),
		Execution: &s.WorkflowExecution{
			WorkflowId: common.StringPtr(w.workflowInfo.WorkflowExecution.ID),
			RunId:      common.StringPtr(w.workflowInfo.WorkflowExecution.RunID),
		},
	}
	// w.laTunnel could be nil for worker.ReplayHistory() because no worker is started; in that case we don't
	// care about the resetStickinessTask.
	if w.laTunnel != nil && w.laTunnel.resultCh != nil {
		w.laTunnel.resultCh <- &task
	}
}

func (w *workflowExecutionContextImpl) clearState() {
	w.clearCurrentTask()
	w.isWorkflowCompleted = false
	w.result = nil
	w.err = nil
	w.previousStartedEventID = 0
	w.newDecisions = nil

	eventHandler := w.getEventHandler()
	if eventHandler != nil {
		// Set isReplay to true to prevent user code in defer guarded by !isReplaying() from running
		eventHandler.isReplay = true
		eventHandler.Close()
		w.eventHandler.Store((*workflowExecutionEventHandlerImpl)(nil))
	}
}

func (w *workflowExecutionContextImpl) createEventHandler() {
	w.clearState()
	eventHandler := newWorkflowExecutionEventHandler(
		w.workflowInfo,
		w.completeWorkflow,
		w.wth.logger,
		w.wth.enableLoggingInReplay,
		w.wth.metricsScope,
		w.wth.registry,
		w.wth.dataConverter,
		w.wth.contextPropagators,
		w.wth.tracer,
		w.wth.workflowInterceptorFactories,
	)
	w.eventHandler.Store(eventHandler)
}

func resetHistory(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) (*s.History, error) {
	historyIterator.Reset()
	firstPageHistory, err := historyIterator.GetNextPage()
	if err != nil {
		return nil, err
	}
	task.History = firstPageHistory
	return firstPageHistory, nil
}

func (wth *workflowTaskHandlerImpl) createWorkflowContext(task *s.PollForDecisionTaskResponse) (*workflowExecutionContextImpl, error) {
	h := task.History
	attributes := h.Events[0].WorkflowExecutionStartedEventAttributes
	if attributes == nil {
		return nil, errors.New("first history event is not WorkflowExecutionStarted")
	}
	taskList := attributes.TaskList
	if taskList == nil {
		return nil, errors.New("nil TaskList in WorkflowExecutionStarted event")
	}

	runID := task.WorkflowExecution.GetRunId()
	workflowID := task.WorkflowExecution.GetWorkflowId()

	// Set up the workflow info
	var parentWorkflowExecution *WorkflowExecution
	if attributes.ParentWorkflowExecution != nil {
		parentWorkflowExecution = &WorkflowExecution{
			ID:    attributes.ParentWorkflowExecution.GetWorkflowId(),
			RunID: attributes.ParentWorkflowExecution.GetRunId(),
		}
	}
	workflowInfo := &WorkflowInfo{
		WorkflowExecution: WorkflowExecution{
			ID:    workflowID,
			RunID: runID,
		},
		OriginalRunId:                       attributes.GetOriginalExecutionRunId(),
		WorkflowType:                        flowWorkflowTypeFrom(*task.WorkflowType),
		TaskListName:                        taskList.GetName(),
		ExecutionStartToCloseTimeoutSeconds: attributes.GetExecutionStartToCloseTimeoutSeconds(),
		TaskStartToCloseTimeoutSeconds:      attributes.GetTaskStartToCloseTimeoutSeconds(),
		Domain:                              wth.domain,
		Attempt:                             attributes.GetAttempt(),
		lastCompletionResult:                attributes.LastCompletionResult,
		CronSchedule:                        attributes.CronSchedule,
		ContinuedExecutionRunID:             attributes.ContinuedExecutionRunId,
		ParentWorkflowDomain:                attributes.ParentWorkflowDomain,
		ParentWorkflowExecution:             parentWorkflowExecution,
		Memo:                                attributes.Memo,
		SearchAttributes:                    attributes.SearchAttributes,
		RetryPolicy:                         attributes.RetryPolicy,
	}

	wfStartTime := time.Unix(0, h.Events[0].GetTimestamp())
	return newWorkflowExecutionContext(wfStartTime, workflowInfo, wth), nil
}

func (wth *workflowTaskHandlerImpl) getOrCreateWorkflowContext(
	task *s.PollForDecisionTaskResponse,
	historyIterator HistoryIterator,
) (workflowContext *workflowExecutionContextImpl, err error) {
	metricsScope := wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName())
	defer func(metricsScope tally.Scope) {
		if err == nil && workflowContext != nil && workflowContext.laTunnel == nil {
			workflowContext.laTunnel = wth.laTunnel
		}
		metricsScope.Gauge(metrics.StickyCacheSize).Update(float64(getWorkflowCache().Size()))
	}(metricsScope)

	runID := task.WorkflowExecution.GetRunId()

	history := task.History
	isFullHistory := isFullHistory(history)

	workflowContext = nil
	if task.Query == nil || (task.Query != nil && !isFullHistory) {
		workflowContext = getWorkflowContext(runID)
	}

	if workflowContext != nil {
		workflowContext.Lock()
		// add a new tag on the metrics scope with the workflow runtime-length category
		scope := metricsScope.Tagged(map[string]string{tagWorkflowRuntimeLength: workflowCategorizedByTimeout(workflowContext)})
		if task.Query != nil && !isFullHistory {
			// query task and we have a valid cached state
			scope.Counter(metrics.StickyCacheHit).Inc(1)
		} else if history.Events[0].GetEventId() == workflowContext.previousStartedEventID+1 {
			// non-query task and we have a valid cached state
			scope.Counter(metrics.StickyCacheHit).Inc(1)
		} else {
			// non-query task and the cached state is missing events; we need to discard the cached state and rebuild it
			workflowContext.ResetIfStale(task, historyIterator)
		}
	} else {
		if !isFullHistory {
			// we got a partial-history task, but the cached state was already evicted.
			// we need to reset history so we get events from the beginning to replay/rebuild the state
			metricsScope.Counter(metrics.StickyCacheMiss).Inc(1)
			if history, err = resetHistory(task, historyIterator); err != nil {
				return
			}
		}

		if workflowContext, err = wth.createWorkflowContext(task); err != nil {
			return
		}

		if !wth.disableStickyExecution && task.Query == nil {
			workflowContext, _ = putWorkflowContext(runID, workflowContext)
		}
		workflowContext.Lock()
	}

	err = workflowContext.resetStateIfDestroyed(task, historyIterator)
	if err != nil {
		workflowContext.Unlock(err)
	}

	return
}

func isFullHistory(history *s.History) bool {
	return len(history.Events) != 0 && history.Events[0].GetEventType() == s.EventTypeWorkflowExecutionStarted
}

func (w *workflowExecutionContextImpl) resetStateIfDestroyed(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) error {
	// It is possible for two threads (one for a decision task and one for a query task) to both get this same
	// cached workflowContext. If one task finishes with an error, it destroys the cached state. In that case, the
	// second task needs to reset the cached state and start from the beginning of the history.
	if w.IsDestroyed() {
		w.createEventHandler()
		// reset history events if necessary
		if !isFullHistory(task.History) {
			if _, err := resetHistory(task, historyIterator); err != nil {
				return err
			}
		}
	}
	return nil
}

// ProcessWorkflowTask processes all the events of the workflow task.
func (wth *workflowTaskHandlerImpl) ProcessWorkflowTask(
	workflowTask *workflowTask,
	heartbeatFunc decisionHeartbeatFunc,
) (completeRequest interface{}, errRet error) {
	if workflowTask == nil || workflowTask.task == nil {
		return nil, errors.New("nil workflow task provided")
	}
	task := workflowTask.task
	if task.History == nil || len(task.History.Events) == 0 {
		task.History = &s.History{
			Events: []*s.HistoryEvent{},
		}
	}
	if task.Query == nil && len(task.History.Events) == 0 {
		return nil, errors.New("nil or empty history")
	}

	if task.Query != nil && len(task.Queries) != 0 {
		return nil, errors.New("invalid query decision task")
	}

	runID := task.WorkflowExecution.GetRunId()
	workflowID := task.WorkflowExecution.GetWorkflowId()
	traceLog(func() {
		wth.logger.Debug("Processing new workflow task.",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, workflowID),
			zap.String(tagRunID, runID),
			zap.Int64("PreviousStartedEventId", task.GetPreviousStartedEventId()))
	})

	workflowContext, err := wth.getOrCreateWorkflowContext(task, workflowTask.historyIterator)
	if err != nil {
		return nil, err
	}

	defer func() {
		workflowContext.Unlock(errRet)
	}()

	var response interface{}
process_Workflow_Loop:
	for {
		startTime := time.Now()
		response, err = workflowContext.ProcessWorkflowTask(workflowTask)
		if err == nil && response == nil {
		wait_LocalActivity_Loop:
			for {
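				// Illustrative numbers only (the actual ratio constant is defined
				// elsewhere in this package): with a 10s decision timeout and a
				// force-complete ratio of 0.8, the timer below fires ~8s after
				// startTime so the decision task can be heartbeated before the
				// server times it out while local activities are still running.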
				deadlineToTrigger := time.Duration(float32(ratioToForceCompleteDecisionTaskComplete) * float32(workflowContext.GetDecisionTimeout()))
				delayDuration := time.Until(startTime.Add(deadlineToTrigger))
				select {
				case <-time.After(delayDuration):
					// force complete, call the decision heartbeat function
					workflowTask, err = heartbeatFunc(
						workflowContext.CompleteDecisionTask(workflowTask, false),
						startTime,
					)
					if err != nil {
						return nil, &decisionHeartbeatError{Message: fmt.Sprintf("error sending decision heartbeat %v", err)}
					}
					if workflowTask == nil {
						return nil, nil
					}
					continue process_Workflow_Loop

				case lar := <-workflowTask.laResultCh:
					// local activity result ready
					response, err = workflowContext.ProcessLocalActivityResult(workflowTask, lar)
					if err == nil && response == nil {
						// decision task is not done yet, still waiting for more local activities
						continue wait_LocalActivity_Loop
					}
					break process_Workflow_Loop
				}
			}
		} else {
			break process_Workflow_Loop
		}
	}
	return response, err
}

func (w *workflowExecutionContextImpl) ProcessWorkflowTask(workflowTask *workflowTask) (interface{}, error) {
	task := workflowTask.task
	historyIterator := workflowTask.historyIterator
	w.workflowInfo.HistoryBytesServer = task.GetTotalHistoryBytes()
	w.workflowInfo.HistoryCount = task.GetNextEventId() - 1
	if err := w.ResetIfStale(task, historyIterator); err != nil {
		return nil, err
	}
	w.SetCurrentTask(task)

	eventHandler := w.getEventHandler()
	reorderedHistory := newHistory(workflowTask, eventHandler)
	var replayDecisions []*s.Decision
	var respondEvents []*s.HistoryEvent

	skipReplayCheck := w.skipReplayCheck()
	isReplayTest := task.GetPreviousStartedEventId() == replayPreviousStartedEventID
	if isReplayTest {
		w.wth.logger.Info("Processing workflow task in replay test mode",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
		)
	}
	// Process events
ProcessEvents:
	for {
		reorderedEvents, markers, binaryChecksum, err := reorderedHistory.NextDecisionEvents()
		w.wth.metricsScope.GetTaggedScope("workflowtype", w.workflowInfo.WorkflowType.Name).Gauge(metrics.EstimatedHistorySize).Update(float64(w.workflowInfo.TotalHistoryBytes))
		w.wth.metricsScope.GetTaggedScope("workflowtype", w.workflowInfo.WorkflowType.Name).Gauge(metrics.ServerSideHistorySize).Update(float64(w.workflowInfo.HistoryBytesServer))
		if err != nil {
			return nil, err
		}

		if len(reorderedEvents) == 0 {
			break ProcessEvents
		}
		if binaryChecksum == nil {
			w.workflowInfo.BinaryChecksum = common.StringPtr(getBinaryChecksum())
		} else {
			w.workflowInfo.BinaryChecksum = binaryChecksum
		}
		// Markers come from events produced by the current decision
		for _, m := range markers {
			if m.MarkerRecordedEventAttributes.GetMarkerName() != localActivityMarkerName {
				// local activity markers need to be applied after the decision task started event
				err := eventHandler.ProcessEvent(m, true, false)
				if err != nil {
					return nil, err
				}
				if w.isWorkflowCompleted {
					break ProcessEvents
				}
			}
		}

		for i, event := range reorderedEvents {
			isInReplay := reorderedHistory.IsReplayEvent(event)
			isLast := !isInReplay && i == len(reorderedEvents)-1
			if !skipReplayCheck && isDecisionEvent(event.GetEventType()) {
				respondEvents = append(respondEvents, event)
			}

			if isPreloadMarkerEvent(event) {
				// marker events are processed separately
				continue
			}

			// Any pressure points.
			err := w.wth.executeAnyPressurePoints(event, isInReplay)
			if err != nil {
				return nil, err
			}

			err = eventHandler.ProcessEvent(event, isInReplay, isLast)
			if err != nil {
				return nil, err
			}
			if w.isWorkflowCompleted {
				break ProcessEvents
			}
		}

		// now apply local activity markers
		for _, m := range markers {
			if m.MarkerRecordedEventAttributes.GetMarkerName() == localActivityMarkerName {
				err := eventHandler.ProcessEvent(m, true, false)
				if err != nil {
					return nil, err
				}
				if w.isWorkflowCompleted {
					break ProcessEvents
				}
			}
		}
		isReplay := len(reorderedEvents) > 0 && reorderedHistory.IsReplayEvent(reorderedEvents[len(reorderedEvents)-1])
		lastDecisionEventsForReplayTest := isReplayTest && !reorderedHistory.HasNextDecisionEvents()
		if isReplay && !lastDecisionEventsForReplayTest {
			eventDecisions := eventHandler.decisionsHelper.getDecisions(true)
			if len(eventDecisions) > 0 && !skipReplayCheck {
				replayDecisions = append(replayDecisions, eventDecisions...)
			}
		}
	}

	// A non-deterministic error can happen in 2 different places:
	//   1) the replayed decisions do not match the history events. This is usually due to a non-backwards-compatible
	// change to the decider logic, for example changing a call from one activity to a different activity.
	//   2) the decision state machine attempts an illegal state transition while replaying a history event (like
	// activity task completed), but the corresponding decider code that started the event has been removed. In that
	// case the replay of that event panics in the decision state machine and the workflow is marked as completed
	// with the panic error.
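	// A hypothetical instance of case 1: the recorded history contains an
	// ActivityTaskScheduled event for activity "A", but the redeployed decider
	// now calls ExecuteActivity for activity "B" at that point. The replayed
	// ScheduleActivityTask decision no longer matches the recorded event, so
	// matchReplayWithHistory below returns a non-determinism error.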
	var nonDeterministicErr error
	var nonDeterminismType nonDeterminismDetectionType
	if !skipReplayCheck && !w.isWorkflowCompleted || isReplayTest {
		// check if the decisions from replay match the history events
		if err := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents); err != nil {
			nonDeterministicErr = err
			nonDeterminismType = nonDeterminismDetectionTypeReplayComparison
		}
	} else if panicErr, ok := w.getWorkflowPanicIfIllegaleStatePanic(); ok {
		// This is a non-deterministic execution which ended up panicking
		nonDeterministicErr = panicErr
		nonDeterminismType = nonDeterminismDetectionTypeIllegalStatePanic
		// Since we know there is an error, we do the replay check to give more context in the log
		replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents)
		w.wth.logger.Error("Illegal state caused panic",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Error(nonDeterministicErr),
			zap.NamedError("ReplayError", replayErr),
		)
	}

	if nonDeterministicErr != nil {
		scope := w.wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName(), tagNonDeterminismDetectionType, string(nonDeterminismType))
		scope.Counter(metrics.NonDeterministicError).Inc(1)
		w.wth.logger.Error("non-deterministic-error",
			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Error(nonDeterministicErr))

		switch w.wth.nonDeterministicWorkflowPolicy {
		case NonDeterministicWorkflowPolicyFailWorkflow:
			// completing the workflow with a custom error will fail the workflow
			eventHandler.Complete(nil, NewCustomError("NonDeterministicWorkflowPolicyFailWorkflow", nonDeterministicErr.Error()))
		case NonDeterministicWorkflowPolicyBlockWorkflow:
			// returning an error here is converted to DecisionTaskFailed the first time and ignored for subsequent
			// attempts, which causes DecisionTaskTimeout; the server will retry forever until the issue is fixed or
			// the workflow times out.
			return nil, nonDeterministicErr
		default:
			panic("unknown mismatched workflow history policy.")
		}
	}

	return w.CompleteDecisionTask(workflowTask, true), nil
}

func (w *workflowExecutionContextImpl) ProcessLocalActivityResult(workflowTask *workflowTask, lar *localActivityResult) (interface{}, error) {
	if lar.err != nil && w.retryLocalActivity(lar) {
		return nil, nil // nothing to do here as we are retrying...
	}

	err := w.getEventHandler().ProcessLocalActivityResult(lar)
	if err != nil {
		return nil, err
	}

	return w.CompleteDecisionTask(workflowTask, true), nil
}

func (w *workflowExecutionContextImpl) retryLocalActivity(lar *localActivityResult) bool {
	if lar.task.retryPolicy == nil || lar.err == nil || IsCanceledError(lar.err) {
		return false
	}

	backoff := getRetryBackoff(lar, time.Now())
	if backoff > 0 && backoff <= w.GetDecisionTimeout() {
		// we need a local retry
		time.AfterFunc(backoff, func() {
			// TODO: this should not be a separate goroutine as it introduces a race condition when accessing
			// eventHandler. Currently this is solved by changing eventHandler to an atomic.Value. Ideally, this
			// retry timer should be part of the event loop for processing the workflow task.
			eventHandler := w.getEventHandler()

			// if the decision heartbeat failed, the workflow execution context will be cleared and eventHandler will be nil
			if eventHandler == nil {
				return
			}

			if _, ok := eventHandler.pendingLaTasks[lar.task.activityID]; !ok {
				return
			}

			lar.task.attempt++

			if !w.laTunnel.sendTask(lar.task) {
				lar.task.attempt--
			}
		})
		return true
	}
	// The backoff could be large, potentially much larger than the DecisionTaskTimeout. We cannot just sleep locally
	// to retry, because that would delay the local activity from completing, which keeps the decision task open, and
	// keeping the decision task open requires continually "heartbeating" the current decision task.
	// In that case, it is more efficient to create a server-side timer with the backoff duration and retry when that
	// timer fires. So here we return false to indicate that no local retry is needed anymore. However, we have to
	// store the current attempt and backoff in the same LocalActivityResultMarker so replay can do the right thing.
	// The backoff timer will be created by workflow.ExecuteLocalActivity().
	lar.backoff = backoff

	return false
}

func getRetryBackoff(lar *localActivityResult, now time.Time) time.Duration {
	p := lar.task.retryPolicy
	var errReason string
	if len(p.NonRetriableErrorReasons) > 0 {
		if lar.err == ErrDeadlineExceeded {
			errReason = "timeout:" + s.TimeoutTypeScheduleToClose.String()
		} else {
			errReason, _ = getErrorDetails(lar.err, nil)
		}
	}
	return getRetryBackoffWithNowTime(p, lar.task.attempt, errReason, now, lar.task.expireTime)
}

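// getRetryBackoffWithNowTime computes the delay before the next retry attempt.
// Worked example (illustrative numbers only): with InitialInterval = 1s,
// BackoffCoefficient = 2.0, MaximumInterval = 10s and attempt = 4, the raw
// interval is 1s * 2^4 = 16s, which is capped to 10s by MaximumInterval. If
// now + 10s is past expireTime, the attempt count has reached MaximumAttempts,
// or the error reason is non-retriable, noRetryBackoff is returned instead.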
func getRetryBackoffWithNowTime(p *RetryPolicy, attempt int32, errReason string, now, expireTime time.Time) time.Duration {
	if p.MaximumAttempts == 0 && p.ExpirationInterval == 0 {
		return noRetryBackoff
	}

	if p.MaximumAttempts > 0 && attempt > p.MaximumAttempts-1 {
		return noRetryBackoff // max attempts reached
	}

	backoffInterval := time.Duration(float64(p.InitialInterval) * math.Pow(p.BackoffCoefficient, float64(attempt)))
	if backoffInterval <= 0 {
		// math.Pow() could overflow
		if p.MaximumInterval > 0 {
			backoffInterval = p.MaximumInterval
		} else {
			return noRetryBackoff
		}
	}

	if p.MaximumInterval > 0 && backoffInterval > p.MaximumInterval {
		// cap the next interval to MaximumInterval
		backoffInterval = p.MaximumInterval
	}

	nextScheduleTime := now.Add(backoffInterval)
	if !expireTime.IsZero() && nextScheduleTime.After(expireTime) {
		return noRetryBackoff
	}

	// check if the error is non-retriable
	for _, er := range p.NonRetriableErrorReasons {
		if er == errReason {
			return noRetryBackoff
		}
	}

	return backoffInterval
}

func (w *workflowExecutionContextImpl) CompleteDecisionTask(workflowTask *workflowTask, waitLocalActivities bool) interface{} {
	if w.currentDecisionTask == nil {
		return nil
	}
	eventHandler := w.getEventHandler()

	// w.laTunnel could be nil for worker.ReplayHistory() because there is no worker started. In that case we don't
	// care about the pending local activities and just return, because the result is ignored anyway by the caller.
	if w.hasPendingLocalActivityWork() && w.laTunnel != nil {
		if len(eventHandler.unstartedLaTasks) > 0 {
			// start new local activity tasks
			unstartedLaTasks := make(map[string]struct{})
			for activityID := range eventHandler.unstartedLaTasks {
				task := eventHandler.pendingLaTasks[activityID]
				task.wc = w
				task.workflowTask = workflowTask
				if !w.laTunnel.sendTask(task) {
					unstartedLaTasks[activityID] = struct{}{}
					task.wc = nil
					task.workflowTask = nil
				}
			}
			eventHandler.unstartedLaTasks = unstartedLaTasks
		}
		// cannot complete the decision task while there are pending local activities
		if waitLocalActivities {
			return nil
		}
	}

	eventDecisions := eventHandler.decisionsHelper.getDecisions(true)
	if len(eventDecisions) > 0 {
		w.newDecisions = append(w.newDecisions, eventDecisions...)
	}

	completeRequest := w.wth.completeWorkflow(eventHandler, w.currentDecisionTask, w, w.newDecisions, !waitLocalActivities)
	w.clearCurrentTask()

	return completeRequest
}

func (w *workflowExecutionContextImpl) hasPendingLocalActivityWork() bool {
	eventHandler := w.getEventHandler()
	return !w.isWorkflowCompleted &&
		w.currentDecisionTask != nil &&
		w.currentDecisionTask.Query == nil && // don't run local activities for query tasks
		eventHandler != nil &&
		len(eventHandler.pendingLaTasks) > 0
}

func (w *workflowExecutionContextImpl) clearCurrentTask() {
	w.newDecisions = nil
	w.currentDecisionTask = nil
}

func (w *workflowExecutionContextImpl) skipReplayCheck() bool {
	return w.currentDecisionTask.Query != nil || !isFullHistory(w.currentDecisionTask.History)
}

func (w *workflowExecutionContextImpl) SetCurrentTask(task *s.PollForDecisionTaskResponse) {
	w.currentDecisionTask = task
	// do not update the previousStartedEventID for query tasks
	if task.Query == nil {
		w.previousStartedEventID = task.GetStartedEventId()
	}
	w.decisionStartTime = time.Now()
}

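// ResetIfStale discards and rebuilds the cached state when the incoming task
// does not directly follow it. Illustrative case: the cache recorded
// previousStartedEventID = 10, but the new task's first history event has
// ID 14; events 11-13 were processed elsewhere (e.g. by another worker), so
// the cached state cannot be trusted and must be rebuilt from full history.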
func (w *workflowExecutionContextImpl) ResetIfStale(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) error {
	if len(task.History.Events) > 0 && task.History.Events[0].GetEventId() != w.previousStartedEventID+1 {
		w.wth.logger.Debug("Cached state is stale, new task has unexpected events",
			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
			zap.Int64("CachedPreviousStartedEventID", w.previousStartedEventID),
			zap.Int64("TaskFirstEventID", task.History.Events[0].GetEventId()),
			zap.Int64("TaskStartedEventID", task.GetStartedEventId()),
			zap.Int64("TaskPreviousStartedEventID", task.GetPreviousStartedEventId()))

		w.wth.metricsScope.
			GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName()).
			Counter(metrics.StickyCacheStall).Inc(1)

		w.clearState()
		return w.resetStateIfDestroyed(task, historyIterator)
	}
	return nil
}

func (w *workflowExecutionContextImpl) GetDecisionTimeout() time.Duration {
	return time.Second * time.Duration(w.workflowInfo.TaskStartToCloseTimeoutSeconds)
}

func (w *workflowExecutionContextImpl) getWorkflowPanicIfIllegaleStatePanic() (*workflowPanicError, bool) {
	if !w.isWorkflowCompleted || w.err == nil {
		return nil, false
	}

	panicErr, ok := w.err.(*workflowPanicError)
	if !ok || panicErr.value == nil {
		return nil, false
	}

	_, ok = panicErr.value.(stateMachineIllegalStatePanic)
	if !ok {
		return nil, false
	}

	return panicErr, true
}

  1222  func (wth *workflowTaskHandlerImpl) completeWorkflow(
  1223  	eventHandler *workflowExecutionEventHandlerImpl,
  1224  	task *s.PollForDecisionTaskResponse,
  1225  	workflowContext *workflowExecutionContextImpl,
  1226  	decisions []*s.Decision,
  1227  	forceNewDecision bool) interface{} {
  1228  
  1229  	// for query task
  1230  	if task.Query != nil {
  1231  		queryCompletedRequest := &s.RespondQueryTaskCompletedRequest{TaskToken: task.TaskToken}
  1232  		if panicErr, ok := workflowContext.err.(*PanicError); ok {
  1233  			// NOTE: this code path should never be executed, we should check for workflowPanicError instead of PanicError
  1234  			wth.logger.Warn("Encountered PanicError in workflow query task",
  1235  				zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
  1236  				zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
  1237  				zap.String(tagPanicError, panicErr.Error()),
  1238  				zap.String(tagPanicStack, panicErr.StackTrace()),
  1239  			)
  1240  
  1241  			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeFailed)
  1242  			queryCompletedRequest.ErrorMessage = common.StringPtr("Workflow panic: " + panicErr.Error())
  1243  			return queryCompletedRequest
  1244  		}
  1245  
  1246  		if workflowPanicErr, ok := workflowContext.err.(*workflowPanicError); ok {
  1247  			// NOTE: in this case we should return complete query task with CompletedTypeFailed
  1248  			// but we didn't check for the right error type before, this may break existing customer
  1249  			wth.logger.Warn("Ignored workflow panic error for query, query result may be partial",
  1250  				zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
  1251  				zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
  1252  				zap.String(tagPanicError, workflowPanicErr.Error()),
  1253  				zap.String(tagPanicStack, workflowPanicErr.StackTrace()),
  1254  				zap.Int64("PreviousStartedEventID", task.GetPreviousStartedEventId()),
  1255  			)
  1256  		}
  1257  
  1258  		result, err := eventHandler.ProcessQuery(task.Query.GetQueryType(), task.Query.QueryArgs)
  1259  		if err != nil {
  1260  			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeFailed)
  1261  			queryCompletedRequest.ErrorMessage = common.StringPtr(err.Error())
  1262  		} else {
  1263  			queryCompletedRequest.CompletedType = common.QueryTaskCompletedTypePtr(s.QueryTaskCompletedTypeCompleted)
  1264  			queryCompletedRequest.QueryResult = result
  1265  		}
  1266  		return queryCompletedRequest
  1267  	}
  1268  
  1269  	metricsScope := wth.metricsScope.GetTaggedScope(tagWorkflowType, eventHandler.workflowEnvironmentImpl.workflowInfo.WorkflowType.Name)
  1270  
  1271  	// fail decision task on decider panic
  1272  	if panicErr, ok := workflowContext.err.(*workflowPanicError); ok {
  1273  		// Workflow panic
  1274  		metricsScope.Counter(metrics.DecisionTaskPanicCounter).Inc(1)
  1275  		wth.logger.Error("Workflow panic.",
  1276  			zap.String(tagWorkflowType, task.WorkflowType.GetName()),
  1277  			zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
  1278  			zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
  1279  			zap.String(tagPanicError, panicErr.Error()),
  1280  			zap.String(tagPanicStack, panicErr.StackTrace()))
  1281  		return errorToFailDecisionTask(task.TaskToken, panicErr, wth.identity)
  1282  	}
  1283  
  1284  	// complete decision task
  1285  	var closeDecision *s.Decision
  1286  	if canceledErr, ok := workflowContext.err.(*CanceledError); ok {
  1287  		// Workflow cancelled
  1288  		metricsScope.Counter(metrics.WorkflowCanceledCounter).Inc(1)
  1289  		closeDecision = createNewDecision(s.DecisionTypeCancelWorkflowExecution)
  1290  		_, details := getErrorDetails(canceledErr, wth.dataConverter)
  1291  		closeDecision.CancelWorkflowExecutionDecisionAttributes = &s.CancelWorkflowExecutionDecisionAttributes{
  1292  			Details: details,
  1293  		}
  1294  	} else if contErr, ok := workflowContext.err.(*ContinueAsNewError); ok {
  1295  		// Continue as new error.
  1296  		metricsScope.Counter(metrics.WorkflowContinueAsNewCounter).Inc(1)
  1297  		closeDecision = createNewDecision(s.DecisionTypeContinueAsNewWorkflowExecution)
  1298  		closeDecision.ContinueAsNewWorkflowExecutionDecisionAttributes = &s.ContinueAsNewWorkflowExecutionDecisionAttributes{
  1299  			WorkflowType:                        workflowTypePtr(*contErr.params.workflowType),
  1300  			Input:                               contErr.params.input,
  1301  			TaskList:                            common.TaskListPtr(s.TaskList{Name: contErr.params.taskListName}),
  1302  			ExecutionStartToCloseTimeoutSeconds: contErr.params.executionStartToCloseTimeoutSeconds,
  1303  			TaskStartToCloseTimeoutSeconds:      contErr.params.taskStartToCloseTimeoutSeconds,
  1304  			Header:                              contErr.params.header,
  1305  			Memo:                                workflowContext.workflowInfo.Memo,
  1306  			SearchAttributes:                    workflowContext.workflowInfo.SearchAttributes,
  1307  			RetryPolicy:                         workflowContext.workflowInfo.RetryPolicy,
  1308  		}
  1309  	} else if workflowContext.err != nil {
  1310  		// Workflow failures
  1311  		metricsScope.Counter(metrics.WorkflowFailedCounter).Inc(1)
  1312  		closeDecision = createNewDecision(s.DecisionTypeFailWorkflowExecution)
  1313  		reason, details := getErrorDetails(workflowContext.err, wth.dataConverter)
  1314  		closeDecision.FailWorkflowExecutionDecisionAttributes = &s.FailWorkflowExecutionDecisionAttributes{
  1315  			Reason:  common.StringPtr(reason),
  1316  			Details: details,
  1317  		}
  1318  	} else if workflowContext.isWorkflowCompleted {
  1319  		// Workflow completion
  1320  		metricsScope.Counter(metrics.WorkflowCompletedCounter).Inc(1)
  1321  		closeDecision = createNewDecision(s.DecisionTypeCompleteWorkflowExecution)
  1322  		closeDecision.CompleteWorkflowExecutionDecisionAttributes = &s.CompleteWorkflowExecutionDecisionAttributes{
  1323  			Result: workflowContext.result,
  1324  		}
  1325  	}
  1326  
  1327  	if closeDecision != nil {
  1328  		decisions = append(decisions, closeDecision)
  1329  		elapsed := time.Since(workflowContext.workflowStartTime)
  1330  		metricsScope.Timer(metrics.WorkflowEndToEndLatency).Record(elapsed)
  1331  		forceNewDecision = false
  1332  	}
  1333  
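        	// Answer any queries that arrived attached to this decision task and
        	// carry the results on the completion request built below.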
  1334  	var queryResults map[string]*s.WorkflowQueryResult
  1335  	if len(task.Queries) != 0 {
  1336  		queryResults = make(map[string]*s.WorkflowQueryResult)
  1337  		for queryID, query := range task.Queries {
  1338  			result, err := eventHandler.ProcessQuery(query.GetQueryType(), query.QueryArgs)
  1339  			if err != nil {
  1340  				queryResults[queryID] = &s.WorkflowQueryResult{
  1341  					ResultType:   common.QueryResultTypePtr(s.QueryResultTypeFailed),
  1342  					ErrorMessage: common.StringPtr(err.Error()),
  1343  				}
  1344  			} else {
  1345  				queryResults[queryID] = &s.WorkflowQueryResult{
  1346  					ResultType: common.QueryResultTypePtr(s.QueryResultTypeAnswered),
  1347  					Answer:     result,
  1348  				}
  1349  			}
  1350  		}
  1351  	}
  1352  
  1353  	return &s.RespondDecisionTaskCompletedRequest{
  1354  		TaskToken:                  task.TaskToken,
  1355  		Decisions:                  decisions,
  1356  		Identity:                   common.StringPtr(wth.identity),
  1357  		ReturnNewDecisionTask:      common.BoolPtr(true),
  1358  		ForceCreateNewDecisionTask: common.BoolPtr(forceNewDecision),
  1359  		BinaryChecksum:             common.StringPtr(getBinaryChecksum()),
  1360  		QueryResults:               queryResults,
  1361  	}
  1362  }
  1363  
  1364  func errorToFailDecisionTask(taskToken []byte, err error, identity string) *s.RespondDecisionTaskFailedRequest {
  1365  	failedCause := s.DecisionTaskFailedCauseWorkflowWorkerUnhandledFailure
  1366  	_, details := getErrorDetails(err, nil)
  1367  	return &s.RespondDecisionTaskFailedRequest{
  1368  		TaskToken:      taskToken,
  1369  		Cause:          &failedCause,
  1370  		Details:        details,
  1371  		Identity:       common.StringPtr(identity),
  1372  		BinaryChecksum: common.StringPtr(getBinaryChecksum()),
  1373  	}
  1374  }
  1375  
  1376  func (wth *workflowTaskHandlerImpl) executeAnyPressurePoints(event *s.HistoryEvent, isInReplay bool) error {
  1377  	if wth.ppMgr != nil && !reflect.ValueOf(wth.ppMgr).IsNil() && !isInReplay {
  1378  		switch event.GetEventType() {
  1379  		case s.EventTypeDecisionTaskStarted:
  1380  			return wth.ppMgr.Execute(pressurePointTypeDecisionTaskStartTimeout)
  1381  		case s.EventTypeActivityTaskScheduled:
  1382  			return wth.ppMgr.Execute(pressurePointTypeActivityTaskScheduleTimeout)
  1383  		case s.EventTypeActivityTaskStarted:
  1384  			return wth.ppMgr.Execute(pressurePointTypeActivityTaskStartTimeout)
  1385  		case s.EventTypeDecisionTaskCompleted:
  1386  			return wth.ppMgr.Execute(pressurePointTypeDecisionTaskCompleted)
  1387  		}
  1388  	}
  1389  	return nil
  1390  }
  1391  
  1392  func newActivityTaskHandler(
  1393  	service workflowserviceclient.Interface,
  1394  	params workerExecutionParameters,
  1395  	registry *registry,
  1396  ) ActivityTaskHandler {
  1397  	return newActivityTaskHandlerWithCustomProvider(service, params, registry, nil)
  1398  }
  1399  
  1400  func newActivityTaskHandlerWithCustomProvider(
  1401  	service workflowserviceclient.Interface,
  1402  	params workerExecutionParameters,
  1403  	registry *registry,
  1404  	activityProvider activityProvider,
  1405  ) ActivityTaskHandler {
  1406  	return &activityTaskHandlerImpl{
  1407  		taskListName:       params.TaskList,
  1408  		identity:           params.Identity,
  1409  		service:            service,
  1410  		logger:             params.Logger,
  1411  		metricsScope:       metrics.NewTaggedScope(params.MetricsScope),
  1412  		userContext:        params.UserContext,
  1413  		registry:           registry,
  1414  		activityProvider:   activityProvider,
  1415  		dataConverter:      params.DataConverter,
  1416  		workerStopCh:       params.WorkerStopChannel,
  1417  		contextPropagators: params.ContextPropagators,
  1418  		tracer:             params.Tracer,
  1419  		featureFlags:       params.FeatureFlags,
  1420  	}
  1421  }
  1422  
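        // cadenceInvoker implements ServiceInvoker on top of the Cadence service
        // client. It batches heartbeats so at most one request is sent per
        // reporting window, and it surfaces server-requested cancellation by
        // calling cancelHandler.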
  1423  type cadenceInvoker struct {
  1424  	sync.Mutex
  1425  	identity              string
  1426  	service               workflowserviceclient.Interface
  1427  	taskToken             []byte
  1428  	cancelHandler         func()
  1429  	heartBeatTimeoutInSec int32       // The heartbeat timeout configured for this activity.
  1430  	hbBatchEndTimer       *time.Timer // Non-nil while a batching window is open; started on a user heartbeat call and fires when the window closes.
  1431  	detailsToReport       *[]byte     // Details to be reported in the next reporting interval.
  1432  	lastDetailsReported   *[]byte     // Details that were reported in the last reporting interval.
  1433  	closeCh               chan struct{}
  1434  	workerStopChannel     <-chan struct{}
  1435  	featureFlags          FeatureFlags
  1436  	logger                *zap.Logger
  1437  	workflowType          string
  1438  	activityType          string
  1439  }
  1440  
  1441  func (i *cadenceInvoker) Heartbeat(details []byte) error {
  1442  	i.Lock()
  1443  	defer i.Unlock()
  1444  
  1445  	_, err := i.internalHeartBeat(details)
  1446  	return err
  1447  }
  1448  
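        // BackgroundHeartbeat is driven by the auto-heartbeat goroutine in
        // Execute: it re-sends the most recently reported details (or opens a new
        // batching window) without requiring fresh progress from the activity.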
  1449  func (i *cadenceInvoker) BackgroundHeartbeat() error {
  1450  	i.Lock()
  1451  	defer i.Unlock()
  1452  
  1453  	if i.hbBatchEndTimer != nil {
  1454  		if i.detailsToReport == nil {
  1455  			i.detailsToReport = i.lastDetailsReported
  1456  		}
  1457  
  1458  		return nil
  1459  	}
  1460  
  1461  	var details []byte
  1462  	if i.detailsToReport != nil {
  1463  		details = *i.detailsToReport
  1464  	} else if i.lastDetailsReported != nil {
  1465  		details = *i.lastDetailsReported
  1466  	}
  1467  
  1468  	return i.heartbeatAndScheduleNextRun(details)
  1469  }
  1470  
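        // BatchHeartbeat records a user-initiated heartbeat. A minimal sketch of
        // how user code reaches it through the public API (hypothetical example,
        // not part of this file):
        //
        //	func MyActivity(ctx context.Context, items []string) error {
        //		for i := range items {
        //			activity.RecordHeartbeat(ctx, i) // eventually routed here
        //		}
        //		return nil
        //	}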
  1471  func (i *cadenceInvoker) BatchHeartbeat(details []byte) error {
  1472  	i.Lock()
  1473  	defer i.Unlock()
  1474  
  1475  	if i.hbBatchEndTimer != nil {
  1476  		// A batching window is already open; just record the latest progress to send when it closes.
  1477  		i.detailsToReport = &details
  1478  		return nil
  1479  	}
  1480  
  1481  	return i.heartbeatAndScheduleNextRun(details)
  1482  }
  1483  
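        // heartbeatAndScheduleNextRun sends one heartbeat immediately and, on
        // success (or on cooperative cancellation), opens a batching window sized
        // at 80% of the configured heartbeat timeout. As a worked example, with a
        // 600s heartbeat timeout:
        //
        //	duration = time.Duration(0.8*float32(600)) * time.Second // 480s window
        //
        // Heartbeats recorded while the window is open only update
        // detailsToReport; the latest details are flushed when the window closes.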
  1484  func (i *cadenceInvoker) heartbeatAndScheduleNextRun(details []byte) error {
  1485  	isActivityCancelled, err := i.internalHeartBeat(details)
  1486  
  1487  	// Even if the activity is cancelled, it can ignore the cancellation, keep doing its work,
  1488  	// and complete. Cancellation is cooperative, so we keep trying to heartbeat.
  1489  	if err == nil || isActivityCancelled {
  1490  		// We have successfully sent a heartbeat; start the next batching window.
  1491  		i.lastDetailsReported = &details
  1492  		i.detailsToReport = nil
  1493  
  1494  		// Create a timer that fires before the heartbeat-timeout threshold.
  1495  		deadlineToTrigger := i.heartBeatTimeoutInSec
  1496  		if deadlineToTrigger <= 0 {
  1497  			// No heartbeat timeout is configured; fall back to the default interval.
  1498  			deadlineToTrigger = defaultHeartBeatIntervalInSec
  1499  		}
  1500  
  1501  		// We set a deadline at 80% of the timeout.
  1502  		duration := time.Duration(0.8*float32(deadlineToTrigger)) * time.Second
  1503  		i.hbBatchEndTimer = time.NewTimer(duration)
  1504  
  1505  		go func() {
  1506  			select {
  1507  			case <-i.hbBatchEndTimer.C:
  1508  				// We are close to deadline.
  1509  			case <-i.workerStopChannel:
  1510  				// The activity worker is about to stop; perform the same steps as when the batch timer fires.
  1511  			case <-i.closeCh:
  1512  				// We got closed.
  1513  				return
  1514  			}
  1515  
  1516  			// We close the batch and report the progress.
  1517  			var detailsToReport *[]byte
  1518  
  1519  			i.Lock()
  1520  			detailsToReport = i.detailsToReport
  1521  			i.hbBatchEndTimer.Stop()
  1522  			i.hbBatchEndTimer = nil
  1523  
  1524  			var err error
  1525  			if detailsToReport != nil {
  1526  				err = i.heartbeatAndScheduleNextRun(*detailsToReport)
  1527  			}
  1528  			i.Unlock()
  1529  
  1530  			// Log the error outside the lock.
  1531  			i.logFailedHeartBeat(err)
  1532  		}()
  1533  	}
  1534  
  1535  	return err
  1536  }
  1537  
  1538  func (i *cadenceInvoker) logFailedHeartBeat(err error) {
  1539  	// If the error is a canceled error, do not log it, as this is expected.
  1540  	var canceledErr *CanceledError
  1541  
  1542  	// Check err for nil explicitly: errors.As returns false for nil, which would otherwise make us log nil errors.
  1543  	if err != nil && !errors.As(err, &canceledErr) {
  1544  		i.logger.Error("Failed to send heartbeat", zap.Error(err), zap.String(tagWorkflowType, i.workflowType), zap.String(tagActivityType, i.activityType))
  1545  	}
  1546  }
  1547  
  1548  func (i *cadenceInvoker) internalHeartBeat(details []byte) (bool, error) {
  1549  	isActivityCancelled := false
  1550  	timeout := time.Duration(i.heartBeatTimeoutInSec) * time.Second
  1551  	if timeout <= 0 {
  1552  		timeout = time.Duration(defaultHeartBeatIntervalInSec) * time.Second
  1553  	}
  1554  	ctx, cancel := context.WithTimeout(context.Background(), timeout)
  1555  	defer cancel()
  1556  
  1557  	err := recordActivityHeartbeat(ctx, i.service, i.identity, i.taskToken, details, i.featureFlags)
  1558  
  1559  	switch err.(type) {
  1560  	case *CanceledError:
  1561  		// We are asked to cancel: inform the activity of the cancellation through its context.
  1562  		i.cancelHandler()
  1563  		isActivityCancelled = true
  1564  
  1565  	case *s.EntityNotExistsError, *s.WorkflowExecutionAlreadyCompletedError, *s.DomainNotActiveError:
  1566  		// We treat these as cancellation for now; this can change later,
  1567  		// once there is a setter on the cancel handler.
  1568  		i.cancelHandler()
  1569  		isActivityCancelled = true
  1570  	}
  1571  
  1572  	// We don't want to bubble temporary errors up to the user.
  1573  	// This error is not returned to the user; see RecordActivityHeartbeat().
  1574  	return isActivityCancelled, err
  1575  }
  1576  
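        // Close stops the heartbeat batching loop. When flushBufferedHeartbeat is
        // true and details are still buffered, one final heartbeat is sent so the
        // most recent progress is not lost; Execute passes true whenever the
        // activity did not complete successfully.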
  1577  func (i *cadenceInvoker) Close(flushBufferedHeartbeat bool) {
  1578  	i.Lock()
  1579  	defer i.Unlock()
  1580  	close(i.closeCh)
  1581  	if i.hbBatchEndTimer != nil {
  1582  		i.hbBatchEndTimer.Stop()
  1583  		if flushBufferedHeartbeat && i.detailsToReport != nil {
  1584  			i.internalHeartBeat(*i.detailsToReport)
  1585  			i.lastDetailsReported = i.detailsToReport
  1586  			i.detailsToReport = nil
  1587  		}
  1588  	}
  1589  }
  1590  
  1591  func (i *cadenceInvoker) SignalWorkflow(ctx context.Context, domain, workflowID, runID, signalName string, signalInput []byte) error {
  1592  	return signalWorkflow(ctx, i.service, i.identity, domain, workflowID, runID, signalName, signalInput, i.featureFlags)
  1593  }
  1594  
  1595  func newServiceInvoker(
  1596  	taskToken []byte,
  1597  	identity string,
  1598  	service workflowserviceclient.Interface,
  1599  	cancelHandler func(),
  1600  	heartBeatTimeoutInSec int32,
  1601  	workerStopChannel <-chan struct{},
  1602  	featureFlags FeatureFlags,
  1603  	logger *zap.Logger,
  1604  	workflowType string,
  1605  	activityType string,
  1606  ) ServiceInvoker {
  1607  	return &cadenceInvoker{
  1608  		taskToken:             taskToken,
  1609  		identity:              identity,
  1610  		service:               service,
  1611  		cancelHandler:         cancelHandler,
  1612  		heartBeatTimeoutInSec: heartBeatTimeoutInSec,
  1613  		closeCh:               make(chan struct{}),
  1614  		workerStopChannel:     workerStopChannel,
  1615  		featureFlags:          featureFlags,
  1616  		logger:                logger,
  1617  		workflowType:          workflowType,
  1618  		activityType:          activityType,
  1619  	}
  1620  }
  1621  
  1622  // Execute executes an implementation of the activity.
  1623  func (ath *activityTaskHandlerImpl) Execute(taskList string, t *s.PollForActivityTaskResponse) (result interface{}, err error) {
  1624  	traceLog(func() {
  1625  		ath.logger.Debug("Processing new activity task",
  1626  			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
  1627  			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
  1628  			zap.String(tagActivityType, t.ActivityType.GetName()))
  1629  	})
  1630  
  1631  	rootCtx := ath.userContext
  1632  	if rootCtx == nil {
  1633  		rootCtx = context.Background()
  1634  	}
  1635  	canCtx, cancel := context.WithCancel(rootCtx)
  1636  	defer cancel()
  1637  
  1638  	workflowType := t.WorkflowType.GetName()
  1639  	activityType := t.ActivityType.GetName()
  1640  	invoker := newServiceInvoker(t.TaskToken, ath.identity, ath.service, cancel, t.GetHeartbeatTimeoutSeconds(), ath.workerStopCh, ath.featureFlags, ath.logger, workflowType, activityType)
  1641  	defer func() {
  1642  		_, activityCompleted := result.(*s.RespondActivityTaskCompletedRequest)
  1643  		invoker.Close(!activityCompleted) // flush the buffered heartbeat if the activity did not complete successfully.
  1644  	}()
  1645  
  1646  	metricsScope := getMetricsScopeForActivity(ath.metricsScope, workflowType, activityType)
  1647  	ctx := WithActivityTask(canCtx, t, taskList, invoker, ath.logger, metricsScope, ath.dataConverter, ath.workerStopCh, ath.contextPropagators, ath.tracer)
  1648  
  1649  	activityImplementation := ath.getActivity(activityType)
  1650  	if activityImplementation == nil {
  1651  		// Couldn't find the activity implementation.
  1652  		supported := strings.Join(ath.getRegisteredActivityNames(), ", ")
  1653  		return nil, fmt.Errorf("unable to find activityType=%v. Supported types: [%v]", activityType, supported)
  1654  	}
  1655  
  1656  	// panic handler
  1657  	defer func() {
  1658  		if p := recover(); p != nil {
  1659  			topLine := fmt.Sprintf("activity for %s [panic]:", ath.taskListName)
  1660  			st := getStackTraceRaw(topLine, 7, 0)
  1661  			ath.logger.Error("Activity panic.",
  1662  				zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
  1663  				zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
  1664  				zap.String(tagActivityType, activityType),
  1665  				zap.String(tagPanicError, fmt.Sprintf("%v", p)),
  1666  				zap.String(tagPanicStack, st))
  1667  			metricsScope.Counter(metrics.ActivityTaskPanicCounter).Inc(1)
  1668  			panicErr := newPanicError(p, st)
  1669  			result, err = convertActivityResultToRespondRequest(ath.identity, t.TaskToken, nil, panicErr, ath.dataConverter), nil
  1670  		}
  1671  	}()
  1672  
  1673  	// propagate context information into the activity context from the headers
  1674  	for _, ctxProp := range ath.contextPropagators {
  1675  		var err error
  1676  		if ctx, err = ctxProp.Extract(ctx, NewHeaderReader(t.Header)); err != nil {
  1677  			return nil, fmt.Errorf("unable to propagate context %v", err)
  1678  		}
  1679  	}
  1680  
  1681  	info := ctx.Value(activityEnvContextKey).(*activityEnvironment)
  1682  	ctx, dlCancelFunc := context.WithDeadline(ctx, info.deadline)
  1683  	defer dlCancelFunc()
  1684  
  1685  	ctx, span := createOpenTracingActivitySpan(ctx, ath.tracer, time.Now(), activityType, t.WorkflowExecution.GetWorkflowId(), t.WorkflowExecution.GetRunId())
  1686  	defer span.Finish()
  1687  
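        	// When auto-heartbeat is enabled for this activity, a background
        	// goroutine heartbeats at half the configured heartbeat timeout (for
        	// example, every 15s for a 30s timeout) until the activity returns or
        	// the worker stops.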
  1688  	if activityImplementation.GetOptions().EnableAutoHeartbeat && t.HeartbeatTimeoutSeconds != nil && *t.HeartbeatTimeoutSeconds > 0 {
  1689  		go func() {
  1690  			autoHbInterval := time.Duration(*t.HeartbeatTimeoutSeconds) * time.Second / 2
  1691  			ticker := time.NewTicker(autoHbInterval)
  1692  			defer ticker.Stop()
  1693  			for {
  1694  				select {
  1695  				case <-ath.workerStopCh:
  1696  					return
  1697  				case <-ctx.Done():
  1698  					return
  1699  				case <-ticker.C:
  1700  					hbErr := invoker.BackgroundHeartbeat()
  1701  					if hbErr != nil && !IsCanceledError(hbErr) {
  1702  						ath.logger.Error("Activity auto heartbeat error.",
  1703  							zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
  1704  							zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
  1705  							zap.String(tagActivityType, activityType),
  1706  							zap.Error(hbErr),
  1707  						)
  1708  					}
  1709  				}
  1710  			}
  1711  		}()
  1712  	}
  1713  
  1714  	output, err := activityImplementation.Execute(ctx, t.Input)
  1715  
  1716  	dlCancelFunc()
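        	// dlCancelFunc above closes ctx.Done(), so the receive below never
        	// blocks; ctx.Err() then tells a genuine deadline expiry apart from our
        	// own cancellation.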
  1717  	if <-ctx.Done(); ctx.Err() == context.DeadlineExceeded {
  1718  		ath.logger.Warn("Activity timeout.",
  1719  			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
  1720  			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
  1721  			zap.String(tagActivityType, activityType),
  1722  		)
  1723  		return nil, ctx.Err()
  1724  	}
  1725  	if err != nil && err != ErrActivityResultPending {
  1726  		ath.logger.Error("Activity error.",
  1727  			zap.String(tagWorkflowID, t.WorkflowExecution.GetWorkflowId()),
  1728  			zap.String(tagRunID, t.WorkflowExecution.GetRunId()),
  1729  			zap.String(tagActivityType, activityType),
  1730  			zap.Error(err),
  1731  		)
  1732  	}
  1733  	return convertActivityResultToRespondRequest(ath.identity, t.TaskToken, output, err, ath.dataConverter), nil
  1734  }
  1735  
  1736  func (ath *activityTaskHandlerImpl) getActivity(name string) activity {
  1737  	if ath.activityProvider != nil {
  1738  		return ath.activityProvider(name)
  1739  	}
  1740  
  1741  	if a, ok := ath.registry.GetActivity(name); ok {
  1742  		return a
  1743  	}
  1744  
  1745  	return nil
  1746  }
  1747  
  1748  func (ath *activityTaskHandlerImpl) getRegisteredActivityNames() (activityNames []string) {
  1749  	for _, a := range ath.registry.getRegisteredActivities() {
  1750  		activityNames = append(activityNames, a.ActivityType().Name)
  1751  	}
  1752  	return
  1753  }
  1754  
  1755  func createNewDecision(decisionType s.DecisionType) *s.Decision {
  1756  	return &s.Decision{
  1757  		DecisionType: common.DecisionTypePtr(decisionType),
  1758  	}
  1759  }
  1760  func signalWorkflow(
  1761  	ctx context.Context,
  1762  	service workflowserviceclient.Interface,
  1763  	identity string,
  1764  	domain string,
  1765  	workflowID string,
  1766  	runID string,
  1767  	signalName string,
  1768  	signalInput []byte,
  1769  	featureFlags FeatureFlags,
  1770  ) error {
  1771  	request := &s.SignalWorkflowExecutionRequest{
  1772  		Domain: common.StringPtr(domain),
  1773  		WorkflowExecution: &s.WorkflowExecution{
  1774  			WorkflowId: common.StringPtr(workflowID),
  1775  			RunId:      getRunID(runID),
  1776  		},
  1777  		SignalName: common.StringPtr(signalName),
  1778  		Input:      signalInput,
  1779  		Identity:   common.StringPtr(identity),
  1780  	}
  1781  
  1782  	return backoff.Retry(ctx,
  1783  		func() error {
  1784  			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
  1785  			defer cancel()
  1786  			return service.SignalWorkflowExecution(tchCtx, request, opt...)
  1787  		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)
  1788  }
  1789  
  1790  func recordActivityHeartbeat(
  1791  	ctx context.Context,
  1792  	service workflowserviceclient.Interface,
  1793  	identity string,
  1794  	taskToken, details []byte,
  1795  	featureFlags FeatureFlags,
  1796  ) error {
  1797  	request := &s.RecordActivityTaskHeartbeatRequest{
  1798  		TaskToken: taskToken,
  1799  		Details:   details,
  1800  		Identity:  common.StringPtr(identity)}
  1801  
  1802  	var heartbeatResponse *s.RecordActivityTaskHeartbeatResponse
  1803  	heartbeatErr := backoff.Retry(ctx,
  1804  		func() error {
  1805  			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
  1806  			defer cancel()
  1807  
  1808  			var err error
  1809  			heartbeatResponse, err = service.RecordActivityTaskHeartbeat(tchCtx, request, opt...)
  1810  			return err
  1811  		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)
  1812  
  1813  	if heartbeatErr == nil && heartbeatResponse != nil && heartbeatResponse.GetCancelRequested() {
  1814  		return NewCanceledError()
  1815  	}
  1816  
  1817  	return heartbeatErr
  1818  }
  1819  
  1820  func recordActivityHeartbeatByID(
  1821  	ctx context.Context,
  1822  	service workflowserviceclient.Interface,
  1823  	identity string,
  1824  	domain, workflowID, runID, activityID string,
  1825  	details []byte,
  1826  	featureFlags FeatureFlags,
  1827  ) error {
  1828  	request := &s.RecordActivityTaskHeartbeatByIDRequest{
  1829  		Domain:     common.StringPtr(domain),
  1830  		WorkflowID: common.StringPtr(workflowID),
  1831  		RunID:      common.StringPtr(runID),
  1832  		ActivityID: common.StringPtr(activityID),
  1833  		Details:    details,
  1834  		Identity:   common.StringPtr(identity)}
  1835  
  1836  	var heartbeatResponse *s.RecordActivityTaskHeartbeatResponse
  1837  	heartbeatErr := backoff.Retry(ctx,
  1838  		func() error {
  1839  			tchCtx, cancel, opt := newChannelContext(ctx, featureFlags)
  1840  			defer cancel()
  1841  
  1842  			var err error
  1843  			heartbeatResponse, err = service.RecordActivityTaskHeartbeatByID(tchCtx, request, opt...)
  1844  			return err
  1845  		}, createDynamicServiceRetryPolicy(ctx), isServiceTransientError)
  1846  
  1847  	if heartbeatErr == nil && heartbeatResponse != nil && heartbeatResponse.GetCancelRequested() {
  1848  		return NewCanceledError()
  1849  	}
  1850  
  1851  	return heartbeatErr
  1852  }
  1853  
  1854  // traceLog runs fn only when verbose logging is enabled in the client library;
  1855  // see worker.EnableVerboseLogging().
  1856  func traceLog(fn func()) {
  1857  	if enableVerboseLogging {
  1858  		fn()
  1859  	}
  1860  }
  1861  
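        // workflowCategorizedByTimeout buckets a workflow by its execution
        // start-to-close timeout, using the default instant-, short- and
        // medium-lived upper-limit constants defined at the top of this file.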
  1862  func workflowCategorizedByTimeout(wfContext *workflowExecutionContextImpl) string {
  1863  	executionTimeout := wfContext.workflowInfo.ExecutionStartToCloseTimeoutSeconds
  1864  	if executionTimeout <= defaultInstantLivedWorkflowTimeoutUpperLimitInSec {
  1865  		return "instant"
  1866  	} else if executionTimeout <= defaultShortLivedWorkflowTimeoutUpperLimitInSec {
  1867  		return "short"
  1868  	} else if executionTimeout <= defaultMediumLivedWorkflowTimeoutUpperLimitInSec {
  1869  		return "intermediate"
  1870  	} else {
  1871  		return "long"
  1872  	}
  1873  }