go.uber.org/cadence@v1.2.9/internal/workflow_shadower.go (about)

     1  // Copyright (c) 2017-2021 Uber Technologies Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package internal
    22  
    23  import (
    24  	"context"
    25  	"errors"
    26  	"fmt"
    27  	"math"
    28  	"math/rand"
    29  	"sync"
    30  	"sync/atomic"
    31  	"time"
    32  
    33  	"github.com/facebookgo/clock"
    34  	"go.uber.org/zap"
    35  
    36  	"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
    37  	"go.uber.org/cadence/.gen/go/shadower"
    38  	"go.uber.org/cadence/internal/common"
    39  	"go.uber.org/cadence/internal/common/util"
    40  )
    41  
    42  const (
    43  	statusInitialized int32 = 0
    44  	statusStarted     int32 = 1
    45  	statusStopped     int32 = 2
    46  )
    47  
    48  const (
    49  	defaultWaitDurationPerIteration = 5 * time.Minute
    50  )
    51  
    52  type (
    53  	// ShadowOptions is used to configure workflow shadowing.
    54  	ShadowOptions struct {
    55  		// Optional: Workflow visibility query for getting workflows that should be replayed
    56  		// if specified, WorkflowTypes, WorkflowStatus, WorkflowStartTimeFilter fields must not be specified.
    57  		// default: empty query, which matches all workflows
    58  		WorkflowQuery string
    59  
    60  		// Optional: A list of workflow type names.
    61  		// The list will be used to construct WorkflowQuery. Only workflows with types listed will be replayed.
    62  		// default: empty list, which matches all workflow types
    63  		WorkflowTypes []string
    64  
    65  		// Optional: A list of workflow status.
    66  		// The list will be used to construct WorkflowQuery. Only workflows with status listed will be replayed.
    67  		// accepted values (case-insensitive): OPEN, CLOSED, ALL, COMPLETED, FAILED, CANCELED, TERMINATED, CONTINUED_AS_NEW, TIMED_OUT
    68  		// default: OPEN, which matches only open workflows
    69  		WorkflowStatus []string
    70  
    71  		// Optional: Min and Max workflow start timestamp.
    72  		// Timestamps will be used to construct WorkflowQuery. Only workflows started within the time range will be replayed.
    73  		// default: no time filter, which matches all workflow start timestamp
    74  		WorkflowStartTimeFilter TimeFilter
    75  
    76  		// Optional: Min and Max workflow close timestamp.
    77  		// Timestamps will be used to construct WorkflowQuery. Only workflows closed within the time range will be replayed. If this filter is being used along with the thee StartTime filter then make sure the Min Close time stamp
    78  		// is within the range of Start timestamp.
    79  		// default: no time filter, which matches all workflow closed timestamp
    80  		WorkflowCloseTimeFilter TimeFilter
    81  
    82  		// Optional: sampling rate for the workflows matches WorkflowQuery
    83  		// only sampled workflows will be replayed
    84  		// default: 1.0
    85  		SamplingRate float64
    86  
    87  		// Optional: sets if shadowing should continue after all workflows matches the WorkflowQuery have been replayed.
    88  		// If set to ShadowModeContinuous, ExitCondition must be specified.
    89  		// default: ShadowModeNormal, which means shadowing will complete after all workflows have been replayed
    90  		Mode ShadowMode
    91  
    92  		// Required if Mode is set to ShadowModeContinuous: controls when shadowing should complete
    93  		ExitCondition ShadowExitCondition
    94  
    95  		// Optional: workflow shadowing concurrency (# of concurrent workflow replay activities)
    96  		// Note: this field only applies to shadow worker. For the local WorkflowShadower,
    97  		// the concurrency will always be 1.
    98  		// An error will be returned if it's set to be larger than 1 when used to NewWorkflowShadower
    99  		// default: 1
   100  		Concurrency int
   101  	}
   102  
   103  	// TimeFilter represents a time range through the min and max timestamp
   104  	TimeFilter struct {
   105  		MinTimestamp time.Time
   106  		MaxTimestamp time.Time
   107  	}
   108  
   109  	// ShadowMode is an enum for configuring if shadowing should continue after all workflows matches the WorkflowQuery have been replayed.
   110  	ShadowMode int
   111  
   112  	// ShadowExitCondition configures when the workflow shadower should exit.
   113  	// If not specified shadower will exit after replaying all workflows satisfying the visibility query.
   114  	ShadowExitCondition struct {
   115  		// Optional: Expiration interval for shadowing.
   116  		// Shadowing will exit when this interval has passed.
   117  		// default: no expiration interval
   118  		ExpirationInterval time.Duration
   119  		// Optional: Target number of shadowed workflows.
   120  		// Shadowing will exit after this number is reached.
   121  		// default: no limit on shadow count
   122  		ShadowCount int
   123  	}
   124  
   125  	// WorkflowShadower retrieves and replays workflow history from Cadence service
   126  	// to determine if there's any nondeterministic changes in the workflow definition
   127  	WorkflowShadower struct {
   128  		service       workflowserviceclient.Interface
   129  		domain        string
   130  		shadowOptions ShadowOptions
   131  		logger        *zap.Logger
   132  		replayer      *WorkflowReplayer
   133  
   134  		status     int32
   135  		shutdownCh chan struct{}
   136  		shutdownWG sync.WaitGroup
   137  
   138  		clock clock.Clock
   139  	}
   140  )
   141  
   142  const (
   143  	// ShadowModeNormal is the default mode for workflow shadowing.
   144  	// Shadowing will complete after all workflows matches WorkflowQuery have been replayed.
   145  	ShadowModeNormal ShadowMode = iota
   146  	// ShadowModeContinuous mode will start a new round of shadowing
   147  	// after all workflows matches WorkflowQuery have been replayed.
   148  	// There will be a 5 min wait period between each round,
   149  	// currently this wait period is not configurable.
   150  	// Shadowing will complete only when ExitCondition is met.
   151  	// ExitCondition must be specified when using this mode
   152  	ShadowModeContinuous
   153  )
   154  
   155  // NewWorkflowShadower creates an instance of the WorkflowShadower for testing
   156  // The logger is an optional parameter. Defaults to noop logger if not provided and will override the logger in WorkerOptions
   157  func NewWorkflowShadower(
   158  	service workflowserviceclient.Interface,
   159  	domain string,
   160  	shadowOptions ShadowOptions,
   161  	replayOptions ReplayOptions,
   162  	logger *zap.Logger,
   163  ) (*WorkflowShadower, error) {
   164  	if len(domain) == 0 {
   165  		return nil, errors.New("domain is not set")
   166  	}
   167  
   168  	if err := shadowOptions.validateAndPopulateFields(); err != nil {
   169  		return nil, err
   170  	}
   171  
   172  	if shadowOptions.Concurrency > 1 {
   173  		return nil, errors.New("local workflow shadower doesn't support concurrency > 1")
   174  	}
   175  
   176  	if logger == nil {
   177  		logger = zap.NewNop()
   178  	}
   179  
   180  	return &WorkflowShadower{
   181  		service:       service,
   182  		domain:        domain,
   183  		shadowOptions: shadowOptions,
   184  		logger:        logger,
   185  		replayer:      NewWorkflowReplayerWithOptions(replayOptions),
   186  
   187  		status:     statusInitialized,
   188  		shutdownCh: make(chan struct{}),
   189  
   190  		clock: clock.New(),
   191  	}, nil
   192  }
   193  
   194  // RegisterWorkflow registers workflow function to replay
   195  func (s *WorkflowShadower) RegisterWorkflow(w interface{}) {
   196  	s.replayer.RegisterWorkflow(w)
   197  }
   198  
   199  // RegisterWorkflowWithOptions registers workflow function with custom workflow name to replay
   200  func (s *WorkflowShadower) RegisterWorkflowWithOptions(w interface{}, options RegisterWorkflowOptions) {
   201  	s.replayer.RegisterWorkflowWithOptions(w, options)
   202  }
   203  
   204  // Run starts WorkflowShadower in a blocking fashion
   205  func (s *WorkflowShadower) Run() error {
   206  	if !atomic.CompareAndSwapInt32(&s.status, statusInitialized, statusStarted) {
   207  		return errors.New("Workflow shadower already started")
   208  	}
   209  
   210  	return s.shadowWorker()
   211  }
   212  
   213  // Stop stops WorkflowShadower and wait up to one minute for all goroutines to finish before returning
   214  func (s *WorkflowShadower) Stop() {
   215  	if !atomic.CompareAndSwapInt32(&s.status, statusStarted, statusStopped) {
   216  		return
   217  	}
   218  
   219  	close(s.shutdownCh)
   220  
   221  	if success := util.AwaitWaitGroup(&s.shutdownWG, time.Minute); !success {
   222  		s.logger.Warn("Workflow Shadower timedout on shutdown")
   223  	}
   224  }
   225  
   226  func (s *WorkflowShadower) shadowWorker() error {
   227  	s.shutdownWG.Add(1)
   228  	defer s.shutdownWG.Done()
   229  
   230  	scanRequest := shadower.ScanWorkflowActivityParams{
   231  		Domain:        common.StringPtr(s.domain),
   232  		WorkflowQuery: common.StringPtr(s.shadowOptions.WorkflowQuery),
   233  		SamplingRate:  common.Float64Ptr(s.shadowOptions.SamplingRate),
   234  	}
   235  	s.logger.Info("Shadow workflow query",
   236  		zap.String(tagVisibilityQuery, s.shadowOptions.WorkflowQuery),
   237  	)
   238  
   239  	ctx := context.Background()
   240  	expirationTime := time.Unix(0, math.MaxInt64)
   241  	if s.shadowOptions.ExitCondition.ExpirationInterval != 0 {
   242  		expirationTime = s.clock.Now().Add(s.shadowOptions.ExitCondition.ExpirationInterval)
   243  	}
   244  
   245  	replayCount := 0
   246  	maxReplayCount := math.MaxInt64
   247  	if s.shadowOptions.ExitCondition.ShadowCount != 0 {
   248  		maxReplayCount = s.shadowOptions.ExitCondition.ShadowCount
   249  	}
   250  	rand.Seed(s.clock.Now().UnixNano())
   251  	for {
   252  		scanResult, err := scanWorkflowExecutionsHelper(ctx, s.service, scanRequest, s.logger)
   253  		if err != nil {
   254  			return err
   255  		}
   256  
   257  		for _, execution := range scanResult.Executions {
   258  			if s.clock.Now().After(expirationTime) {
   259  				return nil
   260  			}
   261  
   262  			success, err := replayWorkflowExecutionHelper(
   263  				ctx,
   264  				s.replayer,
   265  				s.service,
   266  				s.logger,
   267  				s.domain,
   268  				WorkflowExecution{
   269  					ID:    execution.GetWorkflowId(),
   270  					RunID: execution.GetRunId(),
   271  				},
   272  			)
   273  			if err != nil {
   274  				return err
   275  			}
   276  			if success {
   277  				replayCount++
   278  			}
   279  
   280  			if replayCount == maxReplayCount {
   281  				return nil
   282  			}
   283  		}
   284  
   285  		if len(scanResult.NextPageToken) == 0 {
   286  			if s.shadowOptions.Mode == ShadowModeNormal || s.clock.Now().Add(defaultWaitDurationPerIteration).After(expirationTime) {
   287  				return nil
   288  			}
   289  
   290  			s.clock.Sleep(defaultWaitDurationPerIteration)
   291  		}
   292  
   293  		scanRequest.NextPageToken = scanResult.NextPageToken
   294  	}
   295  
   296  }
   297  
   298  func (o *ShadowOptions) validateAndPopulateFields() error {
   299  	exitConditionSpecified := o.ExitCondition.ExpirationInterval > 0 || o.ExitCondition.ShadowCount > 0
   300  	if o.Mode == ShadowModeContinuous && !exitConditionSpecified {
   301  		return errors.New("exit condition must be specified if shadow mode is set to continuous")
   302  	}
   303  
   304  	if o.SamplingRate < 0 || o.SamplingRate > 1 {
   305  		return errors.New("sampling rate should be in range [0, 1]")
   306  	}
   307  
   308  	if len(o.WorkflowQuery) != 0 && (len(o.WorkflowTypes) != 0 || len(o.WorkflowStatus) != 0 || !o.WorkflowStartTimeFilter.isEmpty()) {
   309  		return errors.New("workflow types, status, start time and close time filter can't be specified when workflow query is specified")
   310  	}
   311  
   312  	if len(o.WorkflowQuery) == 0 {
   313  		queryBuilder := NewQueryBuilder().WorkflowTypes(o.WorkflowTypes)
   314  
   315  		statuses := make([]WorkflowStatus, 0, len(o.WorkflowStatus))
   316  		for _, statusString := range o.WorkflowStatus {
   317  			status, err := ToWorkflowStatus(statusString)
   318  			if err != nil {
   319  				return err
   320  			}
   321  			statuses = append(statuses, status)
   322  		}
   323  		//All the open statuses are taken by default. This list seems to not work as expected.
   324  		//TODO: verify that the status list works as expected. currently all wfs of all types get picked up.
   325  		if len(statuses) == 0 {
   326  			statuses = []WorkflowStatus{WorkflowStatusOpen}
   327  		}
   328  		queryBuilder.WorkflowStatus(statuses)
   329  
   330  		if !o.WorkflowStartTimeFilter.isEmpty() {
   331  			if err := o.WorkflowStartTimeFilter.validateAndPopulateFields(); err != nil {
   332  				return fmt.Errorf("invalid start time filter, error: %v", err)
   333  			}
   334  			queryBuilder.StartTime(o.WorkflowStartTimeFilter.MinTimestamp, o.WorkflowStartTimeFilter.MaxTimestamp)
   335  		}
   336  
   337  		if !o.WorkflowCloseTimeFilter.isEmpty() {
   338  			if err := o.WorkflowCloseTimeFilter.validateAndPopulateFields(); err != nil {
   339  				return fmt.Errorf("invalid close time filter, error: %v", err)
   340  			}
   341  			queryBuilder.CloseTime(o.WorkflowCloseTimeFilter.MinTimestamp, o.WorkflowCloseTimeFilter.MaxTimestamp)
   342  		}
   343  		o.WorkflowQuery = queryBuilder.Build()
   344  	}
   345  
   346  	if o.SamplingRate == 0 {
   347  		// if not set, defaults to replay all workflows
   348  		o.SamplingRate = 1
   349  	}
   350  
   351  	if o.Concurrency == 0 {
   352  		// if not set, defaults to 1
   353  		o.Concurrency = 1
   354  	}
   355  
   356  	return nil
   357  }
   358  
   359  func (t *TimeFilter) validateAndPopulateFields() error {
   360  	if t.MaxTimestamp.IsZero() {
   361  		t.MaxTimestamp = maxTimestamp
   362  	}
   363  
   364  	if t.MaxTimestamp.Before(t.MinTimestamp) {
   365  		return errors.New("maxTimestamp should be after minTimestamp")
   366  	}
   367  
   368  	return nil
   369  }
   370  
   371  func (t *TimeFilter) isEmpty() bool {
   372  	if t == nil {
   373  		return true
   374  	}
   375  
   376  	return t.MinTimestamp.IsZero() && t.MaxTimestamp.IsZero()
   377  }
   378  
   379  func (m ShadowMode) toThriftPtr() *shadower.Mode {
   380  	switch m {
   381  	case ShadowModeNormal:
   382  		return shadower.ModeNormal.Ptr()
   383  	case ShadowModeContinuous:
   384  		return shadower.ModeContinuous.Ptr()
   385  	default:
   386  		panic(fmt.Sprintf("unknown shadow mode %v", m))
   387  	}
   388  }
   389  
   390  func (e ShadowExitCondition) toThriftPtr() *shadower.ExitCondition {
   391  	return &shadower.ExitCondition{
   392  		ShadowCount:                 common.Int32Ptr(int32(e.ShadowCount)),
   393  		ExpirationIntervalInSeconds: common.Int32Ptr(int32(e.ExpirationInterval.Seconds())),
   394  	}
   395  }