go.uber.org/cadence@v1.2.9/internal/workflow_shadower_activities.go (about) 1 // Copyright (c) 2017-2021 Uber Technologies Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package internal 22 23 import ( 24 "context" 25 "math/rand" 26 "strings" 27 "time" 28 29 "go.uber.org/zap" 30 31 "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" 32 "go.uber.org/cadence/.gen/go/shadower" 33 "go.uber.org/cadence/.gen/go/shared" 34 "go.uber.org/cadence/internal/common" 35 "go.uber.org/cadence/internal/common/backoff" 36 "go.uber.org/cadence/internal/common/metrics" 37 ) 38 39 type ( 40 replayWorkflowActivityProgress struct { 41 Result shadower.ReplayWorkflowActivityResult 42 NextExecutionIdx int 43 } 44 ) 45 46 const ( 47 serviceClientContextKey contextKey = "serviceClient" 48 workflowReplayerContextKey contextKey = "workflowReplayer" 49 ) 50 51 const ( 52 minScanWorkflowResultSize = 10 53 ratioToCompleteScanWorkflow = 0.8 54 scanWorkflowWaitPeriod = 100 * time.Millisecond 55 ) 56 57 func scanWorkflowActivity( 58 ctx context.Context, 59 params shadower.ScanWorkflowActivityParams, 60 ) (shadower.ScanWorkflowActivityResult, error) { 61 logger := GetActivityLogger(ctx) 62 service := ctx.Value(serviceClientContextKey).(workflowserviceclient.Interface) 63 64 scanResult, err := scanWorkflowExecutionsHelper(ctx, service, params, logger) 65 switch err.(type) { 66 case *shared.EntityNotExistsError: 67 err = NewCustomError(shadower.ErrReasonDomainNotExists, err.Error()) 68 case *shared.BadRequestError: 69 err = NewCustomError(shadower.ErrReasonInvalidQuery, err.Error()) 70 } 71 return scanResult, err 72 } 73 74 func scanWorkflowExecutionsHelper( 75 ctx context.Context, 76 service workflowserviceclient.Interface, 77 params shadower.ScanWorkflowActivityParams, 78 logger *zap.Logger, 79 ) (shadower.ScanWorkflowActivityResult, error) { 80 var completionTime time.Time 81 if deadline, ok := ctx.Deadline(); ok { 82 now := time.Now() 83 activityTimeout := deadline.Sub(now) 84 completionTime = now.Add(time.Duration(ratioToCompleteScanWorkflow * float32(activityTimeout))) 85 } 86 87 request := &shared.ListWorkflowExecutionsRequest{ 88 Domain: params.Domain, 89 Query: params.WorkflowQuery, 90 NextPageToken: params.NextPageToken, 91 PageSize: params.PageSize, 92 } 93 94 result := shadower.ScanWorkflowActivityResult{} 95 for { 96 var resp *shared.ListWorkflowExecutionsResponse 97 if err := backoff.Retry(ctx, 98 func() error { 99 tchCtx, cancel, opt := newChannelContext(ctx, FeatureFlags{}) 100 101 var err error 102 resp, err = service.ScanWorkflowExecutions(tchCtx, request, opt...) 103 cancel() 104 105 return err 106 }, 107 createDynamicServiceRetryPolicy(ctx), 108 isServiceTransientError, 109 ); err != nil { 110 logger.Error("Failed to scan workflow executions", 111 zap.String(tagDomain, params.GetDomain()), 112 zap.String(tagVisibilityQuery, params.GetWorkflowQuery()), 113 zap.Error(err), 114 ) 115 return shadower.ScanWorkflowActivityResult{}, err 116 } 117 118 for _, execution := range resp.Executions { 119 if shouldReplay(params.GetSamplingRate()) { 120 result.Executions = append(result.Executions, execution.Execution) 121 } 122 } 123 124 request.NextPageToken = resp.NextPageToken 125 if len(request.NextPageToken) == 0 || 126 len(result.Executions) >= minScanWorkflowResultSize || 127 (!completionTime.IsZero() && time.Now().After(completionTime)) { 128 result.NextPageToken = request.NextPageToken 129 break 130 } 131 132 time.Sleep(scanWorkflowWaitPeriod) 133 } 134 135 return result, nil 136 } 137 138 func shouldReplay(probability float64) bool { 139 if probability == 0 { 140 return true 141 } 142 143 return rand.Float64() <= probability 144 } 145 146 func replayWorkflowActivity( 147 ctx context.Context, 148 params shadower.ReplayWorkflowActivityParams, 149 ) (shadower.ReplayWorkflowActivityResult, error) { 150 logger := GetActivityLogger(ctx) 151 scope := tagScope(GetActivityMetricsScope(ctx), tagDomain, params.GetDomain(), tagTaskList, GetActivityInfo(ctx).TaskList) 152 service := ctx.Value(serviceClientContextKey).(workflowserviceclient.Interface) 153 replayer := ctx.Value(workflowReplayerContextKey).(*WorkflowReplayer) 154 155 var progress replayWorkflowActivityProgress 156 if err := GetHeartbeatDetails(ctx, &progress); err != nil { 157 progress = replayWorkflowActivityProgress{ 158 NextExecutionIdx: 0, 159 Result: shadower.ReplayWorkflowActivityResult{ 160 Succeeded: common.Int32Ptr(0), 161 Skipped: common.Int32Ptr(0), 162 Failed: common.Int32Ptr(0), 163 }, 164 } 165 } 166 167 // following code assumes all pointers in progress.Result are not nil, this is ensured by: 168 // 1. if not previous progress, init to pointer to 0 169 // 2. if has previous progress, the progress uploaded during heartbeat has non nil pointers 170 171 for _, execution := range params.Executions[progress.NextExecutionIdx:] { 172 if execution == nil { 173 continue 174 } 175 176 sw := scope.Timer(metrics.ReplayLatency).Start() 177 success, err := replayWorkflowExecutionHelper(ctx, replayer, service, logger, params.GetDomain(), WorkflowExecution{ 178 ID: execution.GetWorkflowId(), 179 RunID: execution.GetRunId(), 180 }) 181 if err != nil { 182 scope.Counter(metrics.ReplayFailedCounter).Inc(1) 183 *progress.Result.Failed++ 184 if isWorkflowTypeNotRegisteredError(err) { 185 // this should fail the replay workflow as it requires worker deployment to fix the workflow registration. 186 return progress.Result, NewCustomError(shadower.ErrReasonWorkflowTypeNotRegistered, err.Error()) 187 } 188 } else if success { 189 scope.Counter(metrics.ReplaySucceedCounter).Inc(1) 190 *progress.Result.Succeeded++ 191 } else { 192 scope.Counter(metrics.ReplaySkippedCounter).Inc(1) 193 *progress.Result.Skipped++ 194 } 195 sw.Stop() 196 197 progress.NextExecutionIdx++ 198 RecordActivityHeartbeat(ctx, progress) 199 } 200 201 return progress.Result, nil 202 } 203 204 func replayWorkflowExecutionHelper( 205 ctx context.Context, 206 replayer *WorkflowReplayer, 207 service workflowserviceclient.Interface, 208 logger *zap.Logger, 209 domain string, 210 execution WorkflowExecution, 211 ) (bool, error) { 212 taggedLogger := logger.With( 213 zap.String(tagWorkflowID, execution.ID), 214 zap.String(tagRunID, execution.RunID), 215 ) 216 217 err := replayer.ReplayWorkflowExecution(ctx, service, logger, domain, execution) 218 if err == nil { 219 taggedLogger.Info("Successfully replayed workflow") 220 return true, nil 221 } 222 223 if isNondeterministicErr(err) || isWorkflowTypeNotRegisteredError(err) { 224 taggedLogger.Error("Replay workflow failed", zap.Error(err)) 225 return false, err 226 } 227 228 taggedLogger.Info("Skipped replaying workflow", zap.Error(err)) 229 return false, nil 230 } 231 232 func isNondeterministicErr(err error) bool { 233 // There're a few expected replay errors, for example: 234 // 1. errReplayHistoryTooShort 235 // 2. workflow not exist 236 // 3. internal service error when reading workflow history 237 // since we can't get an exhaustive list of expected errors, we only treat replay as failed 238 // when we are sure the error is due to non-determinisim to make sure there's no false positive. 239 // as shadowing doesn't guarantee to catch all nondeterministic errors. 240 return strings.Contains(err.Error(), "nondeterministic") 241 } 242 243 func isWorkflowTypeNotRegisteredError(err error) bool { 244 return strings.Contains(err.Error(), errMsgUnknownWorkflowType) 245 }