go.uber.org/cadence@v1.2.9/internal/workflow_replayer.go (about)

     1  // Copyright (c) 2017-2020 Uber Technologies Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package internal
    22  
    23  import (
    24  	"bytes"
    25  	"context"
    26  	"encoding/json"
    27  	"errors"
    28  	"fmt"
    29  	"io"
    30  	"math"
    31  	"os"
    32  
    33  	"github.com/golang/mock/gomock"
    34  	"github.com/opentracing/opentracing-go"
    35  	"github.com/pborman/uuid"
    36  	"github.com/uber-go/tally"
    37  	"go.uber.org/zap"
    38  
    39  	"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
    40  	"go.uber.org/cadence/.gen/go/cadence/workflowservicetest"
    41  	"go.uber.org/cadence/.gen/go/shared"
    42  	"go.uber.org/cadence/internal/common"
    43  	"go.uber.org/cadence/internal/common/backoff"
    44  	"go.uber.org/cadence/internal/common/serializer"
    45  )
    46  
// Fixed identifiers used when a history is replayed locally.
//
//   - replayDomainName: domain passed to the replay when the history is supplied
//     directly by the caller; IsReplayDomain compares against it.
//   - replayTaskListName: task list set on the replay worker parameters.
//   - replayWorkflowID: workflow ID used when the caller supplies no execution.
//   - replayWorkerIdentity: identity set on the replay worker options.
//   - replayPreviousStartedEventID: PreviousStartedEventId placed on the synthetic
//     decision task.
//   - replayTaskToken: task token placed on the synthetic decision task.
const (
	replayDomainName             = "ReplayDomain"
	replayTaskListName           = "ReplayTaskList"
	replayWorkflowID             = "ReplayId"
	replayWorkerIdentity         = "replayID"
	replayPreviousStartedEventID = math.MaxInt64
	replayTaskToken              = "ReplayTaskToken"
)
    55  
// Validation errors returned by replayWorkflowHistory before any replay is attempted.
//
//   - errReplayEmptyHistory: the history has a nil event slice.
//   - errReplayHistoryTooShort: the history holds fewer than 3 events.
//   - errReplayInvalidFirstEvent: the first event's type is not WorkflowExecutionStarted.
//   - errReplayCorruptedStartedEvent: the first event has no
//     WorkflowExecutionStartedEventAttributes.
var (
	errReplayEmptyHistory          = errors.New("empty events")
	errReplayHistoryTooShort       = errors.New("at least 3 events expected in the history")
	errReplayInvalidFirstEvent     = errors.New("first event is not WorkflowExecutionStarted")
	errReplayCorruptedStartedEvent = errors.New("corrupted WorkflowExecutionStarted")
)
    62  
// WorkflowReplayer is used to replay workflow code from an event history.
// Create one with NewWorkflowReplayer or NewWorkflowReplayerWithOptions, register
// the workflows (and activities) involved, then call one of the Replay* methods.
type WorkflowReplayer struct {
	registry *registry     // workflows and activities added through the Register* methods
	options  ReplayOptions // replay options as adjusted by augmentReplayOptions
}
    68  
// ReplayOptions is used to configure the replay decision task worker.
// Every field is optional; the zero value is what NewWorkflowReplayer uses.
type ReplayOptions struct {
	// Optional: Sets DataConverter to customize serialization/deserialization of arguments in Cadence
	// default: defaultDataConverter, a combination of thriftEncoder and jsonEncoder
	DataConverter DataConverter

	// Optional: Specifies factories used to instantiate the workflow interceptor chain
	// The chain is instantiated for each replay of a workflow execution
	WorkflowInterceptorChainFactories []WorkflowInterceptorFactory

	// Optional: Sets ContextPropagators that allow users to control the context information passed through a workflow
	// default: no ContextPropagators
	ContextPropagators []ContextPropagator

	// Optional: Sets the opentracing Tracer that is to be used to emit tracing information
	// default: no tracer - opentracing.NoopTracer
	Tracer opentracing.Tracer

	// Optional: flags to turn on/off some features on the server side
	// default: all features under the struct are turned off
	FeatureFlags FeatureFlags
}
    91  
    92  // IsReplayDomain checks if the domainName is from replay
    93  func IsReplayDomain(dn string) bool {
    94  	return replayDomainName == dn
    95  }
    96  
    97  // NewWorkflowReplayer creates an instance of the WorkflowReplayer
    98  func NewWorkflowReplayer() *WorkflowReplayer {
    99  	return NewWorkflowReplayerWithOptions(ReplayOptions{})
   100  }
   101  
   102  // NewWorkflowReplayerWithOptions creates an instance of the WorkflowReplayer
   103  // with provided replay worker options
   104  func NewWorkflowReplayerWithOptions(
   105  	options ReplayOptions,
   106  ) *WorkflowReplayer {
   107  	augmentReplayOptions(&options)
   108  	return &WorkflowReplayer{
   109  		registry: newRegistry(),
   110  		options:  options,
   111  	}
   112  }
   113  
   114  // RegisterWorkflow registers workflow function to replay
   115  func (r *WorkflowReplayer) RegisterWorkflow(w interface{}) {
   116  	r.registry.RegisterWorkflow(w)
   117  }
   118  
   119  // RegisterWorkflowWithOptions registers workflow function with custom workflow name to replay
   120  func (r *WorkflowReplayer) RegisterWorkflowWithOptions(w interface{}, options RegisterWorkflowOptions) {
   121  	r.registry.RegisterWorkflowWithOptions(w, options)
   122  }
   123  
   124  // RegisterActivity registers an activity function for this replayer
   125  func (r *WorkflowReplayer) RegisterActivity(a interface{}) {
   126  	r.registry.RegisterActivity(a)
   127  }
   128  
   129  // RegisterActivityWithOptions registers an activity function for this replayer with custom options, e.g. an explicit name.
   130  func (r *WorkflowReplayer) RegisterActivityWithOptions(a interface{}, options RegisterActivityOptions) {
   131  	r.registry.RegisterActivityWithOptions(a, options)
   132  }
   133  
   134  // ReplayWorkflowHistory executes a single decision task for the given history.
   135  // Use for testing backwards compatibility of code changes and troubleshooting workflows in a debugger.
   136  // The logger is an optional parameter. Defaults to the noop logger.
   137  func (r *WorkflowReplayer) ReplayWorkflowHistory(logger *zap.Logger, history *shared.History) error {
   138  	if logger == nil {
   139  		logger = zap.NewNop()
   140  	}
   141  
   142  	testReporter := logger.Sugar()
   143  	controller := gomock.NewController(testReporter)
   144  	service := workflowservicetest.NewMockClient(controller)
   145  
   146  	return r.replayWorkflowHistory(logger, service, replayDomainName, nil, history, nil)
   147  }
   148  
   149  func (r *WorkflowReplayer) ReplayWorkflowHistoryFromJSON(logger *zap.Logger, reader io.Reader) error {
   150  	return r.ReplayPartialWorkflowHistoryFromJSON(logger, reader, 0)
   151  }
   152  
   153  func (r *WorkflowReplayer) ReplayPartialWorkflowHistoryFromJSON(logger *zap.Logger, reader io.Reader, lastEventID int64) error {
   154  	history, err := extractHistoryFromReader(reader, lastEventID)
   155  
   156  	if err != nil {
   157  		return err
   158  	}
   159  
   160  	if logger == nil {
   161  		logger = zap.NewNop()
   162  	}
   163  
   164  	testReporter := logger.Sugar()
   165  	controller := gomock.NewController(testReporter)
   166  	service := workflowservicetest.NewMockClient(controller)
   167  
   168  	return r.replayWorkflowHistory(logger, service, replayDomainName, nil, history, nil)
   169  }
   170  
   171  // ReplayWorkflowHistoryFromJSONFile executes a single decision task for the given json history file.
   172  // Use for testing the backwards compatibility of code changes and troubleshooting workflows in a debugger.
   173  // The logger is an optional parameter. Defaults to the noop logger.
   174  func (r *WorkflowReplayer) ReplayWorkflowHistoryFromJSONFile(logger *zap.Logger, jsonfileName string) error {
   175  	return r.ReplayPartialWorkflowHistoryFromJSONFile(logger, jsonfileName, 0)
   176  }
   177  
   178  // ReplayPartialWorkflowHistoryFromJSONFile executes a single decision task for the given json history file up to provided
   179  // lastEventID(inclusive).
   180  // Use for testing backwards compatibility of code changes and troubleshooting workflows in a debugger.
   181  // The logger is an optional parameter. Defaults to the noop logger.
   182  func (r *WorkflowReplayer) ReplayPartialWorkflowHistoryFromJSONFile(logger *zap.Logger, jsonfileName string, lastEventID int64) error {
   183  	file, err := os.Open(jsonfileName)
   184  	if err != nil {
   185  		return fmt.Errorf("could not open file: %w", err)
   186  	}
   187  	defer func() {
   188  		_ = file.Close()
   189  	}()
   190  	return r.ReplayPartialWorkflowHistoryFromJSON(logger, file, lastEventID)
   191  }
   192  
   193  // ReplayWorkflowExecution replays workflow execution loading it from Cadence service.
   194  // The logger is an optional parameter. Defaults to the noop logger.
   195  func (r *WorkflowReplayer) ReplayWorkflowExecution(
   196  	ctx context.Context,
   197  	service workflowserviceclient.Interface,
   198  	logger *zap.Logger,
   199  	domain string,
   200  	execution WorkflowExecution,
   201  ) error {
   202  	sharedExecution := &shared.WorkflowExecution{
   203  		RunId:      common.StringPtr(execution.RunID),
   204  		WorkflowId: common.StringPtr(execution.ID),
   205  	}
   206  	request := &shared.GetWorkflowExecutionHistoryRequest{
   207  		Domain:    common.StringPtr(domain),
   208  		Execution: sharedExecution,
   209  	}
   210  
   211  	var hResponse *shared.GetWorkflowExecutionHistoryResponse
   212  	if err := backoff.Retry(ctx,
   213  		func() error {
   214  			tchCtx, cancel, opt := newChannelContext(ctx, r.options.FeatureFlags)
   215  
   216  			var err error
   217  			hResponse, err = service.GetWorkflowExecutionHistory(tchCtx, request, opt...)
   218  			cancel()
   219  
   220  			return err
   221  		},
   222  		createDynamicServiceRetryPolicy(ctx),
   223  		func(err error) bool {
   224  			if _, ok := err.(*shared.InternalServiceError); ok {
   225  				// treat InternalServiceError as non-retryable, as the workflow history may be corrupted
   226  				return false
   227  			}
   228  			return isServiceTransientError(err)
   229  		},
   230  	); err != nil {
   231  		return err
   232  	}
   233  
   234  	if hResponse.RawHistory != nil {
   235  		history, err := serializer.DeserializeBlobDataToHistoryEvents(hResponse.RawHistory, shared.HistoryEventFilterTypeAllEvent)
   236  		if err != nil {
   237  			return err
   238  		}
   239  
   240  		hResponse.History = history
   241  	}
   242  
   243  	return r.replayWorkflowHistory(logger, service, domain, &execution, hResponse.History, hResponse.NextPageToken)
   244  }
   245  
   246  func (r *WorkflowReplayer) replayWorkflowHistory(
   247  	logger *zap.Logger,
   248  	service workflowserviceclient.Interface,
   249  	domain string,
   250  	execution *WorkflowExecution,
   251  	history *shared.History,
   252  	nextPageToken []byte,
   253  ) error {
   254  	events := history.Events
   255  	if events == nil {
   256  		return errReplayEmptyHistory
   257  	}
   258  	if len(events) < 3 {
   259  		return errReplayHistoryTooShort
   260  	}
   261  	first := events[0]
   262  	if first.GetEventType() != shared.EventTypeWorkflowExecutionStarted {
   263  		return errReplayInvalidFirstEvent
   264  	}
   265  	last := events[len(events)-1]
   266  
   267  	attr := first.WorkflowExecutionStartedEventAttributes
   268  	if attr == nil {
   269  		return errReplayCorruptedStartedEvent
   270  	}
   271  	workflowType := attr.WorkflowType
   272  	if execution == nil {
   273  		execution = &WorkflowExecution{
   274  			ID:    replayWorkflowID,
   275  			RunID: uuid.NewRandom().String(),
   276  		}
   277  		if first.WorkflowExecutionStartedEventAttributes.GetOriginalExecutionRunId() != "" {
   278  			execution.RunID = first.WorkflowExecutionStartedEventAttributes.GetOriginalExecutionRunId()
   279  		}
   280  	}
   281  
   282  	task := &shared.PollForDecisionTaskResponse{
   283  		Attempt:      common.Int64Ptr(int64(attr.GetAttempt())),
   284  		TaskToken:    []byte(replayTaskToken),
   285  		WorkflowType: workflowType,
   286  		WorkflowExecution: &shared.WorkflowExecution{
   287  			WorkflowId: common.StringPtr(execution.ID),
   288  			RunId:      common.StringPtr(execution.RunID),
   289  		},
   290  		History:                history,
   291  		PreviousStartedEventId: common.Int64Ptr(replayPreviousStartedEventID),
   292  		NextPageToken:          nextPageToken,
   293  	}
   294  	if logger == nil {
   295  		logger = zap.NewNop()
   296  	}
   297  	workerParams := workerExecutionParameters{
   298  		WorkerOptions: WorkerOptions{
   299  			Identity:                          replayWorkerIdentity,
   300  			DataConverter:                     r.options.DataConverter,
   301  			ContextPropagators:                r.options.ContextPropagators,
   302  			WorkflowInterceptorChainFactories: r.options.WorkflowInterceptorChainFactories,
   303  			Tracer:                            r.options.Tracer,
   304  			Logger:                            logger,
   305  			DisableStickyExecution:            true,
   306  		},
   307  		TaskList: replayTaskListName,
   308  	}
   309  
   310  	metricScope := tally.NoopScope
   311  	iterator := &historyIteratorImpl{
   312  		nextPageToken:  task.NextPageToken,
   313  		execution:      task.WorkflowExecution,
   314  		domain:         domain,
   315  		service:        service,
   316  		metricsScope:   metricScope,
   317  		startedEventID: task.GetStartedEventId(),
   318  		featureFlags:   r.options.FeatureFlags,
   319  	}
   320  	taskHandler := newWorkflowTaskHandler(domain, workerParams, nil, r.registry)
   321  	resp, err := taskHandler.ProcessWorkflowTask(&workflowTask{task: task, historyIterator: iterator}, nil)
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	if last.GetEventType() != shared.EventTypeWorkflowExecutionCompleted && last.GetEventType() != shared.EventTypeWorkflowExecutionContinuedAsNew {
   327  		return nil
   328  	}
   329  
   330  	// TODO: the following result will not be executed if nextPageToken is not nil, which is probably fine as the actual workflow task
   331  	// processing logic does not have such check. If we want to always execute this check for closed workflows, we need to dump the
   332  	// entire history before starting the replay as otherwise we can't get the last event here.
   333  	// compare workflow results
   334  	if resp != nil {
   335  		completeReq, ok := resp.(*shared.RespondDecisionTaskCompletedRequest)
   336  		if ok {
   337  			for _, d := range completeReq.Decisions {
   338  				if d.GetDecisionType() == shared.DecisionTypeContinueAsNewWorkflowExecution &&
   339  					last.GetEventType() == shared.EventTypeWorkflowExecutionContinuedAsNew {
   340  					inputA := d.ContinueAsNewWorkflowExecutionDecisionAttributes.Input
   341  					inputB := last.WorkflowExecutionContinuedAsNewEventAttributes.Input
   342  					if bytes.Compare(inputA, inputB) == 0 {
   343  						return nil
   344  					}
   345  				}
   346  				if d.GetDecisionType() == shared.DecisionTypeCompleteWorkflowExecution &&
   347  					last.GetEventType() == shared.EventTypeWorkflowExecutionCompleted {
   348  					resultA := last.WorkflowExecutionCompletedEventAttributes.Result
   349  					resultB := d.CompleteWorkflowExecutionDecisionAttributes.Result
   350  					if bytes.Compare(resultA, resultB) == 0 {
   351  						return nil
   352  					}
   353  				}
   354  				if d.GetDecisionType() == shared.DecisionTypeCompleteWorkflowExecution &&
   355  					last.GetEventType() == shared.EventTypeWorkflowExecutionContinuedAsNew {
   356  					// for cron and retry workflow, decision will be completed workflow and
   357  					// and server side will convert it to a continue as new event.
   358  					// there's nothing to compare here
   359  					return nil
   360  				}
   361  			}
   362  		}
   363  	}
   364  	return fmt.Errorf("replay workflow doesn't return the same result as the last event, resp: %v, last: %v", resp, last)
   365  }
   366  
   367  func extractHistoryFromReader(r io.Reader, lastEventID int64) (*shared.History, error) {
   368  	raw, err := io.ReadAll(r)
   369  	if err != nil {
   370  		return nil, fmt.Errorf("failed to read data: %w", err)
   371  	}
   372  
   373  	var deserializedEvents []*shared.HistoryEvent
   374  	err = json.Unmarshal(raw, &deserializedEvents)
   375  
   376  	if err != nil {
   377  		return nil, fmt.Errorf("invalid json contents: %w", err)
   378  	}
   379  
   380  	if lastEventID <= 0 {
   381  		return &shared.History{Events: deserializedEvents}, nil
   382  	}
   383  
   384  	// Caller is potentially asking for subset of history instead of all history events
   385  	var events []*shared.HistoryEvent
   386  	for _, event := range deserializedEvents {
   387  		events = append(events, event)
   388  		if event.GetEventId() == lastEventID {
   389  			// Copy history up to last event (inclusive)
   390  			break
   391  		}
   392  	}
   393  
   394  	return &shared.History{Events: events}, nil
   395  }
   396  
   397  func augmentReplayOptions(
   398  	options *ReplayOptions,
   399  ) {
   400  	// if the user passes in a tracer then add a tracing context propagator
   401  	if options.Tracer != nil {
   402  		options.ContextPropagators = append(options.ContextPropagators, NewTracingContextPropagator(zap.NewNop(), options.Tracer))
   403  	} else {
   404  		options.Tracer = opentracing.NoopTracer{}
   405  	}
   406  }