go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longop.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package impl
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sync/atomic"
    21  	"time"
    22  
    23  	"go.chromium.org/luci/common/clock"
    24  	"go.chromium.org/luci/common/errors"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/retry/transient"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  	"go.chromium.org/luci/server/tq"
    29  
    30  	bbfacade "go.chromium.org/luci/cv/internal/buildbucket/facade"
    31  	"go.chromium.org/luci/cv/internal/common"
    32  	"go.chromium.org/luci/cv/internal/run"
    33  	"go.chromium.org/luci/cv/internal/run/eventpb"
    34  	"go.chromium.org/luci/cv/internal/run/impl/longops"
    35  )
    36  
    37  // enqueueLongOp enqueues long operations task and updates the given Run's long
    38  // operations state.
    39  func (rp *runProcessor) enqueueLongOps(ctx context.Context, r *run.Run, opIDs ...string) error {
    40  	for _, opID := range opIDs {
    41  		err := rp.tqDispatcher.AddTask(ctx, &tq.Task{
    42  			Title: fmt.Sprintf("%s/%s/%T", r.ID, opID, r.OngoingLongOps.GetOps()[opID].GetWork()),
    43  			Payload: &eventpb.ManageRunLongOpTask{
    44  				RunId:       string(r.ID),
    45  				OperationId: opID,
    46  			},
    47  		})
    48  		if err != nil {
    49  			return err
    50  		}
    51  	}
    52  	return nil
    53  }
    54  
    55  func (rm *RunManager) doLongOperation(ctx context.Context, task *eventpb.ManageRunLongOpTask) error {
    56  	notifyCompleted := func(res *eventpb.LongOpCompleted) error {
    57  		if res.Status == eventpb.LongOpCompleted_LONG_OP_STATUS_UNSPECIFIED {
    58  			panic(fmt.Errorf("LongOpCompleted.Status must be set"))
    59  		}
    60  		res.OperationId = task.GetOperationId()
    61  		return rm.runNotifier.NotifyLongOpCompleted(ctx, common.RunID(task.GetRunId()), res)
    62  	}
    63  
    64  	r := &run.Run{ID: common.RunID(task.GetRunId())}
    65  	switch err := datastore.Get(ctx, r); {
    66  	case err == datastore.ErrNoSuchEntity:
    67  		// Highly unexpected. Fail hard.
    68  		return errors.Annotate(err, "Run %q not found", r.ID).Err()
    69  	case err != nil:
    70  		// Will retry.
    71  		return errors.Annotate(err, "failed to load Run %q", r.ID).Tag(transient.Tag).Err()
    72  	case r.OngoingLongOps.GetOps()[task.GetOperationId()] == nil:
    73  		// Highly unexpected. Fail hard.
    74  		return errors.Annotate(err, "Run %q has no outstanding long operation %q", r.ID, task.GetOperationId()).Err()
    75  	}
    76  	op := r.OngoingLongOps.GetOps()[task.GetOperationId()]
    77  
    78  	now := clock.Now(ctx)
    79  	d := op.GetDeadline().AsTime()
    80  	if d.Before(now) {
    81  		result := &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_EXPIRED}
    82  		if err := notifyCompleted(result); err != nil {
    83  			logging.Errorf(ctx, "Failed to NotifyLongOpCompleted: %s", err)
    84  		}
    85  		return errors.Reason("DoLongRunOperationTask arrived too late (deadline: %s, now %s)", d, now).Err()
    86  	}
    87  
    88  	checker := &longOpCancellationChecker{}
    89  	stop := checker.start(ctx, r, task.GetOperationId())
    90  	defer stop()
    91  
    92  	opBase := longops.Base{
    93  		Run:               r,
    94  		Op:                op,
    95  		IsCancelRequested: checker.check,
    96  	}
    97  
    98  	dctx, cancel := clock.WithDeadline(ctx, d)
    99  	defer cancel()
   100  	f := rm.doLongOperationWithDeadline
   101  	if rm.testDoLongOperationWithDeadline != nil {
   102  		f = rm.testDoLongOperationWithDeadline
   103  	}
   104  	result, err := f(dctx, &opBase)
   105  
   106  	switch {
   107  	case err == nil:
   108  	case transient.Tag.In(err):
   109  		// Just retry.
   110  		return err
   111  	case result != nil:
   112  		// honor the result status returned by the long op
   113  	case errors.Unwrap(err) == dctx.Err():
   114  		logging.Warningf(ctx, "Failed long op due to hitting a deadline, setting result as EXPIRED")
   115  		result = &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_EXPIRED}
   116  	default:
   117  		// permanent failure
   118  		result = &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_FAILED}
   119  	}
   120  
   121  	switch nerr := notifyCompleted(result); {
   122  	case nerr == nil:
   123  		return err
   124  	case err == nil:
   125  		// the op succeeded, but notify failed.
   126  		logging.Errorf(ctx, "Long op succeeded but it failed to notify Run Manager: %s", nerr)
   127  		return nerr
   128  	default:
   129  		// both op and notify failed.
   130  		logging.Errorf(ctx, "Long op %T permanently failed with %s, but also failed to notify Run Manager due to %s", op.GetWork(), err, nerr)
   131  		return err
   132  	}
   133  }
   134  
   135  func (rm *RunManager) doLongOperationWithDeadline(ctx context.Context, opBase *longops.Base) (*eventpb.LongOpCompleted, error) {
   136  	var op longops.Operation
   137  	switch w := opBase.Op.GetWork().(type) {
   138  	case *run.OngoingLongOps_Op_PostStartMessage:
   139  		op = &longops.PostStartMessageOp{
   140  			Base:     opBase,
   141  			Env:      rm.env,
   142  			GFactory: rm.gFactory,
   143  		}
   144  	case *run.OngoingLongOps_Op_PostGerritMessage_:
   145  		op = &longops.PostGerritMessageOp{
   146  			Base:     opBase,
   147  			Env:      rm.env,
   148  			GFactory: rm.gFactory,
   149  		}
   150  	case *run.OngoingLongOps_Op_ResetTriggers_:
   151  		op = &longops.ResetTriggersOp{
   152  			Base:        opBase,
   153  			GFactory:    rm.gFactory,
   154  			CLMutator:   rm.clMutator,
   155  			Concurrency: 8,
   156  		}
   157  	case *run.OngoingLongOps_Op_ExecuteTryjobs:
   158  		op = &longops.ExecuteTryjobsOp{
   159  			Base:        opBase,
   160  			Env:         rm.env,
   161  			RunNotifier: rm.runNotifier,
   162  			Backend: &bbfacade.Facade{
   163  				ClientFactory: rm.bbFactory,
   164  			},
   165  		}
   166  	case *run.OngoingLongOps_Op_ExecutePostAction:
   167  		op = &longops.ExecutePostActionOp{
   168  			Base:        opBase,
   169  			GFactory:    rm.gFactory,
   170  			RunNotifier: rm.runNotifier,
   171  			QM:          rm.qm,
   172  		}
   173  	default:
   174  		logging.Errorf(ctx, "unknown LongOp work %T", w)
   175  		// Fail task quickly for backwards compatibility in case of a rollback during
   176  		// future deployment.
   177  		return nil, errors.Reason("Skipping %T", opBase.Op.GetWork()).Tag(tq.Fatal).Err()
   178  	}
   179  	return op.Do(ctx)
   180  }
   181  
   182  // longOpCancellationChecker asynchronously checks whether the given operation
   183  // was requested to be cancelled by the Run Manager.
   184  type longOpCancellationChecker struct {
   185  	// Options.
   186  
   187  	// interval controls how frequently the Datastore is checked. Defaults to 5s.
   188  	interval time.Duration
   189  	// testChan is used in tests to detect when background goroutine exists.
   190  	testChan chan struct{}
   191  
   192  	// Internal state.
   193  
   194  	// state is atomically updated int which stores state of the cancellation
   195  	//
   196  	// state is atomically updated int which stores state of the cancelation
   197  	// checker:
   198  	//  * cancellationCheckerInitial
   199  	//  * cancellationCheckerStarted
   200  	//  * cancellationCheckerRequested
   201  	state int32
   202  }
   203  
   204  const (
   205  	cancellationCheckerInitial   = 0
   206  	cancellationCheckerStarted   = 1
   207  	cancellationCheckerRequested = 2
   208  )
   209  
   210  // check quickly and cheaply checks whether cancellation was requested.
   211  //
   212  // Does not block on anything.
   213  func (l *longOpCancellationChecker) check() bool {
   214  	return atomic.LoadInt32(&l.state) == cancellationCheckerRequested
   215  }
   216  
   217  // start spawns a goroutine checking if cancellation was requested.
   218  //
   219  // Returns a stop function, which should be called to free resources.
   220  func (l *longOpCancellationChecker) start(ctx context.Context, initial *run.Run, opID string) (stop func()) {
   221  	if !atomic.CompareAndSwapInt32(&l.state, cancellationCheckerInitial, cancellationCheckerStarted) {
   222  		panic(fmt.Errorf("start called more than once"))
   223  	}
   224  	switch {
   225  	case l.interval < 0:
   226  		panic(fmt.Errorf("negative interval %s", l.interval))
   227  	case l.interval == 0:
   228  		l.interval = 5 * time.Second
   229  	case l.interval < time.Second:
   230  		// If lower frequency is desired in the future, use Redis directly (instead
   231  		// of indirectly via dscache).
   232  		panic(fmt.Errorf("too small interval %s -- don't hammer Datastore", l.interval))
   233  	}
   234  
   235  	ctx, stop = context.WithCancel(ctx)
   236  
   237  	// Perform check on initial Run state immediately.
   238  	// This is useful if TQ task performing long op is retried s.t. the request
   239  	// for cancellation is detected immediately as opposed to during the next
   240  	// reload of the Run.
   241  	if !l.reevaluate(ctx, opID, initial, nil) {
   242  		return stop
   243  	}
   244  
   245  	go func() {
   246  		if l.testChan != nil {
   247  			defer close(l.testChan)
   248  		}
   249  		l.background(ctx, opID, initial.ID)
   250  	}()
   251  
   252  	return stop
   253  }
   254  
   255  func (l *longOpCancellationChecker) background(ctx context.Context, opID string, runID common.RunID) {
   256  	next := clock.Now(ctx)
   257  	for {
   258  		next = next.Add(l.interval)
   259  		left := next.Sub(clock.Now(ctx))
   260  		if left > 0 {
   261  			if res := <-clock.After(ctx, left); res.Err != nil {
   262  				// Context got cancelled.
   263  				break
   264  			}
   265  		}
   266  		r := run.Run{ID: runID}
   267  		err := datastore.Get(ctx, &r)
   268  		if !l.reevaluate(ctx, opID, &r, err) {
   269  			break
   270  		}
   271  	}
   272  }
   273  
   274  // reevaluate updates state if necessary and returns whether the background
   275  // checking should continue.
   276  func (l *longOpCancellationChecker) reevaluate(ctx context.Context, opID string, r *run.Run, err error) bool {
   277  	switch {
   278  	case err == datastore.ErrNoSuchEntity:
   279  		logging.Errorf(ctx, "Run was unexpectedly deleted")
   280  		atomic.StoreInt32(&l.state, cancellationCheckerRequested)
   281  		return false
   282  	case err != nil && ctx.Err() != nil:
   283  		// Context was cancelled or expired.
   284  		return false
   285  	case err != nil:
   286  		logging.Warningf(ctx, "Failed to reload Run, will retry: %s", err)
   287  		return true
   288  	case r.OngoingLongOps == nil || r.OngoingLongOps.GetOps()[opID] == nil:
   289  		logging.Warningf(ctx, "Reloaded Run no longer has this operation")
   290  		atomic.StoreInt32(&l.state, cancellationCheckerRequested)
   291  		return false
   292  	case r.OngoingLongOps.GetOps()[opID].GetCancelRequested():
   293  		logging.Warningf(ctx, "Cancellation request detected")
   294  		atomic.StoreInt32(&l.state, cancellationCheckerRequested)
   295  		return false
   296  	default:
   297  		return true
   298  	}
   299  }