go.uber.org/cadence@v1.2.9/internal/workflow_shadower_activities.go (about)

     1  // Copyright (c) 2017-2021 Uber Technologies Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package internal
    22  
    23  import (
    24  	"context"
    25  	"math/rand"
    26  	"strings"
    27  	"time"
    28  
    29  	"go.uber.org/zap"
    30  
    31  	"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
    32  	"go.uber.org/cadence/.gen/go/shadower"
    33  	"go.uber.org/cadence/.gen/go/shared"
    34  	"go.uber.org/cadence/internal/common"
    35  	"go.uber.org/cadence/internal/common/backoff"
    36  	"go.uber.org/cadence/internal/common/metrics"
    37  )
    38  
    39  type (
    40  	replayWorkflowActivityProgress struct {
    41  		Result           shadower.ReplayWorkflowActivityResult
    42  		NextExecutionIdx int
    43  	}
    44  )
    45  
    46  const (
    47  	serviceClientContextKey    contextKey = "serviceClient"
    48  	workflowReplayerContextKey contextKey = "workflowReplayer"
    49  )
    50  
    51  const (
    52  	minScanWorkflowResultSize   = 10
    53  	ratioToCompleteScanWorkflow = 0.8
    54  	scanWorkflowWaitPeriod      = 100 * time.Millisecond
    55  )
    56  
    57  func scanWorkflowActivity(
    58  	ctx context.Context,
    59  	params shadower.ScanWorkflowActivityParams,
    60  ) (shadower.ScanWorkflowActivityResult, error) {
    61  	logger := GetActivityLogger(ctx)
    62  	service := ctx.Value(serviceClientContextKey).(workflowserviceclient.Interface)
    63  
    64  	scanResult, err := scanWorkflowExecutionsHelper(ctx, service, params, logger)
    65  	switch err.(type) {
    66  	case *shared.EntityNotExistsError:
    67  		err = NewCustomError(shadower.ErrReasonDomainNotExists, err.Error())
    68  	case *shared.BadRequestError:
    69  		err = NewCustomError(shadower.ErrReasonInvalidQuery, err.Error())
    70  	}
    71  	return scanResult, err
    72  }
    73  
    74  func scanWorkflowExecutionsHelper(
    75  	ctx context.Context,
    76  	service workflowserviceclient.Interface,
    77  	params shadower.ScanWorkflowActivityParams,
    78  	logger *zap.Logger,
    79  ) (shadower.ScanWorkflowActivityResult, error) {
    80  	var completionTime time.Time
    81  	if deadline, ok := ctx.Deadline(); ok {
    82  		now := time.Now()
    83  		activityTimeout := deadline.Sub(now)
    84  		completionTime = now.Add(time.Duration(ratioToCompleteScanWorkflow * float32(activityTimeout)))
    85  	}
    86  
    87  	request := &shared.ListWorkflowExecutionsRequest{
    88  		Domain:        params.Domain,
    89  		Query:         params.WorkflowQuery,
    90  		NextPageToken: params.NextPageToken,
    91  		PageSize:      params.PageSize,
    92  	}
    93  
    94  	result := shadower.ScanWorkflowActivityResult{}
    95  	for {
    96  		var resp *shared.ListWorkflowExecutionsResponse
    97  		if err := backoff.Retry(ctx,
    98  			func() error {
    99  				tchCtx, cancel, opt := newChannelContext(ctx, FeatureFlags{})
   100  
   101  				var err error
   102  				resp, err = service.ScanWorkflowExecutions(tchCtx, request, opt...)
   103  				cancel()
   104  
   105  				return err
   106  			},
   107  			createDynamicServiceRetryPolicy(ctx),
   108  			isServiceTransientError,
   109  		); err != nil {
   110  			logger.Error("Failed to scan workflow executions",
   111  				zap.String(tagDomain, params.GetDomain()),
   112  				zap.String(tagVisibilityQuery, params.GetWorkflowQuery()),
   113  				zap.Error(err),
   114  			)
   115  			return shadower.ScanWorkflowActivityResult{}, err
   116  		}
   117  
   118  		for _, execution := range resp.Executions {
   119  			if shouldReplay(params.GetSamplingRate()) {
   120  				result.Executions = append(result.Executions, execution.Execution)
   121  			}
   122  		}
   123  
   124  		request.NextPageToken = resp.NextPageToken
   125  		if len(request.NextPageToken) == 0 ||
   126  			len(result.Executions) >= minScanWorkflowResultSize ||
   127  			(!completionTime.IsZero() && time.Now().After(completionTime)) {
   128  			result.NextPageToken = request.NextPageToken
   129  			break
   130  		}
   131  
   132  		time.Sleep(scanWorkflowWaitPeriod)
   133  	}
   134  
   135  	return result, nil
   136  }
   137  
   138  func shouldReplay(probability float64) bool {
   139  	if probability == 0 {
   140  		return true
   141  	}
   142  
   143  	return rand.Float64() <= probability
   144  }
   145  
   146  func replayWorkflowActivity(
   147  	ctx context.Context,
   148  	params shadower.ReplayWorkflowActivityParams,
   149  ) (shadower.ReplayWorkflowActivityResult, error) {
   150  	logger := GetActivityLogger(ctx)
   151  	scope := tagScope(GetActivityMetricsScope(ctx), tagDomain, params.GetDomain(), tagTaskList, GetActivityInfo(ctx).TaskList)
   152  	service := ctx.Value(serviceClientContextKey).(workflowserviceclient.Interface)
   153  	replayer := ctx.Value(workflowReplayerContextKey).(*WorkflowReplayer)
   154  
   155  	var progress replayWorkflowActivityProgress
   156  	if err := GetHeartbeatDetails(ctx, &progress); err != nil {
   157  		progress = replayWorkflowActivityProgress{
   158  			NextExecutionIdx: 0,
   159  			Result: shadower.ReplayWorkflowActivityResult{
   160  				Succeeded: common.Int32Ptr(0),
   161  				Skipped:   common.Int32Ptr(0),
   162  				Failed:    common.Int32Ptr(0),
   163  			},
   164  		}
   165  	}
   166  
   167  	// following code assumes all pointers in progress.Result are not nil, this is ensured by:
   168  	//   1. if not previous progress, init to pointer to 0
   169  	//   2. if has previous progress, the progress uploaded during heartbeat has non nil pointers
   170  
   171  	for _, execution := range params.Executions[progress.NextExecutionIdx:] {
   172  		if execution == nil {
   173  			continue
   174  		}
   175  
   176  		sw := scope.Timer(metrics.ReplayLatency).Start()
   177  		success, err := replayWorkflowExecutionHelper(ctx, replayer, service, logger, params.GetDomain(), WorkflowExecution{
   178  			ID:    execution.GetWorkflowId(),
   179  			RunID: execution.GetRunId(),
   180  		})
   181  		if err != nil {
   182  			scope.Counter(metrics.ReplayFailedCounter).Inc(1)
   183  			*progress.Result.Failed++
   184  			if isWorkflowTypeNotRegisteredError(err) {
   185  				// this should fail the replay workflow as it requires worker deployment to fix the workflow registration.
   186  				return progress.Result, NewCustomError(shadower.ErrReasonWorkflowTypeNotRegistered, err.Error())
   187  			}
   188  		} else if success {
   189  			scope.Counter(metrics.ReplaySucceedCounter).Inc(1)
   190  			*progress.Result.Succeeded++
   191  		} else {
   192  			scope.Counter(metrics.ReplaySkippedCounter).Inc(1)
   193  			*progress.Result.Skipped++
   194  		}
   195  		sw.Stop()
   196  
   197  		progress.NextExecutionIdx++
   198  		RecordActivityHeartbeat(ctx, progress)
   199  	}
   200  
   201  	return progress.Result, nil
   202  }
   203  
   204  func replayWorkflowExecutionHelper(
   205  	ctx context.Context,
   206  	replayer *WorkflowReplayer,
   207  	service workflowserviceclient.Interface,
   208  	logger *zap.Logger,
   209  	domain string,
   210  	execution WorkflowExecution,
   211  ) (bool, error) {
   212  	taggedLogger := logger.With(
   213  		zap.String(tagWorkflowID, execution.ID),
   214  		zap.String(tagRunID, execution.RunID),
   215  	)
   216  
   217  	err := replayer.ReplayWorkflowExecution(ctx, service, logger, domain, execution)
   218  	if err == nil {
   219  		taggedLogger.Info("Successfully replayed workflow")
   220  		return true, nil
   221  	}
   222  
   223  	if isNondeterministicErr(err) || isWorkflowTypeNotRegisteredError(err) {
   224  		taggedLogger.Error("Replay workflow failed", zap.Error(err))
   225  		return false, err
   226  	}
   227  
   228  	taggedLogger.Info("Skipped replaying workflow", zap.Error(err))
   229  	return false, nil
   230  }
   231  
   232  func isNondeterministicErr(err error) bool {
   233  	// There're a few expected replay errors, for example:
   234  	//   1. errReplayHistoryTooShort
   235  	//   2. workflow not exist
   236  	//   3. internal service error when reading workflow history
   237  	// since we can't get an exhaustive list of expected errors, we only treat replay as failed
   238  	// when we are sure the error is due to non-determinisim to make sure there's no false positive.
   239  	// as shadowing doesn't guarantee to catch all nondeterministic errors.
   240  	return strings.Contains(err.Error(), "nondeterministic")
   241  }
   242  
   243  func isWorkflowTypeNotRegisteredError(err error) bool {
   244  	return strings.Contains(err.Error(), errMsgUnknownWorkflowType)
   245  }