go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longop_test.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package impl
    16  
    17  import (
    18  	"context"
    19  	"testing"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/proto"
    23  	"google.golang.org/protobuf/types/known/timestamppb"
    24  
    25  	"go.chromium.org/luci/common/clock"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  	"go.chromium.org/luci/server/tq/tqtesting"
    29  
    30  	"go.chromium.org/luci/cv/internal/common/eventbox"
    31  	"go.chromium.org/luci/cv/internal/cvtesting"
    32  	"go.chromium.org/luci/cv/internal/run"
    33  	"go.chromium.org/luci/cv/internal/run/eventpb"
    34  	"go.chromium.org/luci/cv/internal/run/impl/longops"
    35  	"go.chromium.org/luci/cv/internal/run/impl/state"
    36  	"go.chromium.org/luci/cv/internal/run/runtest"
    37  
    38  	. "github.com/smartystreets/goconvey/convey"
    39  	"go.chromium.org/luci/common/retry/transient"
    40  	. "go.chromium.org/luci/common/testing/assertions"
    41  )
    42  
    43  func TestLongOps(t *testing.T) {
    44  	t.Parallel()
    45  
    46  	Convey("Manager handles long ops", t, func() {
    47  		ct := cvtesting.Test{}
    48  		ctx, cancel := ct.SetUp(t)
    49  		defer cancel()
    50  		const runID = "chromium/222-1-deadbeef"
    51  		const initialEVersion = 10
    52  		So(datastore.Put(ctx, &run.Run{
    53  			ID:       runID,
    54  			Status:   run.Status_RUNNING,
    55  			EVersion: initialEVersion,
    56  		}), ShouldBeNil)
    57  
    58  		loadRun := func(ctx context.Context) *run.Run {
    59  			ret := &run.Run{ID: runID}
    60  			So(datastore.Get(ctx, ret), ShouldBeNil)
    61  			return ret
    62  		}
    63  
    64  		notifier := run.NewNotifier(ct.TQDispatcher)
    65  		proc := &runProcessor{
    66  			runID:        runID,
    67  			runNotifier:  notifier,
    68  			tqDispatcher: ct.TQDispatcher,
    69  		}
    70  
    71  		// Create a new request.
    72  		rs := &state.RunState{Run: *loadRun(ctx)}
    73  		opID := rs.EnqueueLongOp(&run.OngoingLongOps_Op{
    74  			Deadline: timestamppb.New(ct.Clock.Now().Add(time.Minute)),
    75  			Work: &run.OngoingLongOps_Op_PostStartMessage{
    76  				PostStartMessage: true,
    77  			},
    78  		})
    79  		So(rs.OngoingLongOps.GetOps(), ShouldHaveLength, 1)
    80  
    81  		// Simulate what happens when Run state is transactionally updated.
    82  		So(datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    83  			return proc.SaveState(ctx, eventbox.State(rs), eventbox.EVersion(rs.EVersion+1))
    84  		}, nil), ShouldBeNil)
    85  
    86  		// Verify that Run's state records new Long operation.
    87  		r := loadRun(ctx)
    88  		So(r.OngoingLongOps.GetOps(), ShouldHaveLength, 1)
    89  		op := r.OngoingLongOps.GetOps()[opID]
    90  		So(op, ShouldResembleProto, &run.OngoingLongOps_Op{
    91  			CancelRequested: false,
    92  			Deadline:        timestamppb.New(ct.Clock.Now().Add(time.Minute)),
    93  			Work: &run.OngoingLongOps_Op_PostStartMessage{
    94  				PostStartMessage: true,
    95  			},
    96  		})
    97  		// Verify that long op task was enqueued.
    98  		So(ct.TQ.Tasks().Payloads(), ShouldResembleProto, []proto.Message{
    99  			&eventpb.ManageRunLongOpTask{
   100  				OperationId: opID,
   101  				RunId:       runID,
   102  			},
   103  		})
   104  
   105  		Convey("manager handles Long Operation TQ task", func() {
   106  			manager := New(notifier, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, ct.Env)
   107  
   108  			Convey("OK", func() {
   109  				called := false
   110  				manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   111  					called = true
   112  					d, ok := ctx.Deadline()
   113  					So(ok, ShouldBeTrue)
   114  					So(d.UTC(), ShouldResemble, op.GetDeadline().AsTime())
   115  					return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_SUCCEEDED}, nil
   116  				}
   117  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   118  				So(called, ShouldBeTrue)
   119  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   120  					OperationId: opID,
   121  					Status:      eventpb.LongOpCompleted_SUCCEEDED,
   122  				})
   123  			})
   124  
   125  			Convey("CancelRequested handling", func() {
   126  				rs := &state.RunState{Run: *loadRun(ctx)}
   127  				rs.RequestLongOpCancellation(opID)
   128  				rs.EVersion++
   129  				So(datastore.Put(ctx, &rs.Run), ShouldBeNil)
   130  
   131  				called := false
   132  				manager.testDoLongOperationWithDeadline = func(ctx context.Context, opBase *longops.Base) (*eventpb.LongOpCompleted, error) {
   133  					called = true
   134  					So(opBase.IsCancelRequested(), ShouldBeTrue)
   135  					return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_CANCELLED}, nil
   136  				}
   137  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   138  				So(called, ShouldBeTrue)
   139  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   140  					OperationId: opID,
   141  					Status:      eventpb.LongOpCompleted_CANCELLED,
   142  				})
   143  			})
   144  
   145  			Convey("Expired long op must not be executed, but Run Manager should be notified", func() {
   146  				ct.Clock.Add(time.Hour)
   147  				called := false
   148  				manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   149  					called = true
   150  					return &eventpb.LongOpCompleted{}, nil
   151  				}
   152  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   153  				So(called, ShouldBeFalse)
   154  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   155  					OperationId: opID,
   156  					Status:      eventpb.LongOpCompleted_EXPIRED,
   157  				})
   158  			})
   159  
   160  			Convey("Expired while executing", func() {
   161  				called := false
   162  				manager.testDoLongOperationWithDeadline = func(dctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   163  					called = true
   164  					ct.Clock.Add(time.Hour) // expire the `dctx`
   165  					// NOTE: it's unclear why the following sometimes fails:
   166  					for dctx.Err() == nil {
   167  						clock.Sleep(dctx, time.Second)
   168  					}
   169  					So(dctx.Err(), ShouldNotBeNil)
   170  					return nil, errors.Annotate(dctx.Err(), "somehow treating as permanent failure").Err()
   171  				}
   172  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   173  				So(called, ShouldBeTrue)
   174  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   175  					OperationId: opID,
   176  					Status:      eventpb.LongOpCompleted_EXPIRED,
   177  				})
   178  			})
   179  
   180  			Convey("Transient failure is retried", func() {
   181  				called := 0
   182  				manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   183  					called++
   184  					if called == 1 {
   185  						return nil, errors.New("troops", transient.Tag)
   186  					}
   187  					return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_SUCCEEDED}, nil
   188  				}
   189  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   190  				So(called, ShouldEqual, 2)
   191  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   192  					OperationId: opID,
   193  					Status:      eventpb.LongOpCompleted_SUCCEEDED,
   194  				})
   195  			})
   196  
   197  			Convey("Non-transient failure is fatal", func() {
   198  				called := 0
   199  				manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   200  					called++
   201  					if called == 1 {
   202  						return nil, errors.New("foops")
   203  					}
   204  					return &eventpb.LongOpCompleted{}, nil
   205  				}
   206  				ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   207  				So(called, ShouldEqual, 1)
   208  				runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{
   209  					OperationId: opID,
   210  					Status:      eventpb.LongOpCompleted_FAILED,
   211  				})
   212  			})
   213  
   214  			Convey("Doesn't execute in weird cases", func() {
   215  				Convey("Run deleted", func() {
   216  					So(datastore.Delete(ctx, &run.Run{ID: runID}), ShouldBeNil)
   217  					called := false
   218  					manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   219  						called = true
   220  						return &eventpb.LongOpCompleted{}, nil
   221  					}
   222  					ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   223  					So(called, ShouldBeFalse)
   224  					runtest.AssertEventboxEmpty(ctx, runID)
   225  				})
   226  				Convey("Long op is no longer known", func() {
   227  					r := loadRun(ctx)
   228  					r.OngoingLongOps = nil
   229  					So(datastore.Put(ctx, r), ShouldBeNil)
   230  					called := false
   231  					manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) {
   232  						called = true
   233  						return &eventpb.LongOpCompleted{}, nil
   234  					}
   235  					ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass))
   236  					So(called, ShouldBeFalse)
   237  					runtest.AssertEventboxEmpty(ctx, runID)
   238  				})
   239  			})
   240  		})
   241  	})
   242  }
   243  
   244  func TestLongOpCancellationChecker(t *testing.T) {
   245  	t.Parallel()
   246  
   247  	Convey("longOpCancellationChecker works", t, func() {
   248  		ct := cvtesting.Test{}
   249  		ctx, cancel := ct.SetUp(t)
   250  		defer cancel()
   251  		const runID = "chromium/222-1-deadbeef"
   252  		const opID = "op-1"
   253  
   254  		So(datastore.Put(ctx, &run.Run{
   255  			ID:       runID,
   256  			Status:   run.Status_RUNNING,
   257  			EVersion: 1,
   258  			OngoingLongOps: &run.OngoingLongOps{
   259  				Ops: map[string]*run.OngoingLongOps_Op{
   260  					opID: {
   261  						CancelRequested: false, // changed in tests below
   262  						// Other fields aren't relevant to this test.
   263  					},
   264  				},
   265  			},
   266  		}), ShouldBeNil)
   267  
   268  		loadRun := func() *run.Run {
   269  			ret := &run.Run{ID: runID}
   270  			So(datastore.Get(ctx, ret), ShouldBeNil)
   271  			return ret
   272  		}
   273  
   274  		ct.Clock.SetTimerCallback(func(dur time.Duration, _ clock.Timer) {
   275  			// Whenever background goroutine sleeps, awake it immediately.
   276  			ct.Clock.Add(dur)
   277  		})
   278  
   279  		done := make(chan struct{})
   280  
   281  		assertDone := func() {
   282  			select {
   283  			case <-done:
   284  			case <-ctx.Done():
   285  				So("context expired before background goroutine was done", ShouldBeFalse)
   286  			}
   287  		}
   288  		assertNotDone := func() {
   289  			select {
   290  			case <-done:
   291  				So("background goroutine is done", ShouldBeFalse)
   292  			default:
   293  			}
   294  		}
   295  
   296  		l := longOpCancellationChecker{
   297  			interval: time.Second,
   298  			testChan: done,
   299  		}
   300  
   301  		Convey("Normal operation without long op cancellation", func() {
   302  			stop := l.start(ctx, loadRun(), opID)
   303  			defer stop()
   304  			So(l.check(), ShouldBeFalse)
   305  			ct.Clock.Add(time.Minute)
   306  			So(l.check(), ShouldBeFalse)
   307  			assertNotDone()
   308  		})
   309  
   310  		Convey("Initial Run state with cancellation request is noticed immediately", func() {
   311  			r := loadRun()
   312  			r.OngoingLongOps.GetOps()[opID].CancelRequested = true
   313  			So(datastore.Put(ctx, r), ShouldBeNil)
   314  
   315  			stop := l.start(ctx, loadRun(), opID)
   316  			defer stop()
   317  
   318  			So(l.check(), ShouldBeTrue)
   319  			// Background goroutine shouldn't even be started, hence the `done`
   320  			// channel should remain open.
   321  			assertNotDone()
   322  		})
   323  
   324  		Convey("Notices cancellation request eventually", func() {
   325  			stop := l.start(ctx, loadRun(), opID)
   326  			defer stop()
   327  
   328  			// Store request to cancel.
   329  			r := loadRun()
   330  			r.OngoingLongOps.GetOps()[opID].CancelRequested = true
   331  			So(datastore.Put(ctx, r), ShouldBeNil)
   332  
   333  			ct.Clock.Add(time.Minute)
   334  			// Must be done soon.
   335  			assertDone()
   336  			// Now, the cancellation request must be noticed.
   337  			So(l.check(), ShouldBeTrue)
   338  		})
   339  
   340  		Convey("Robust in case of edge cases which should not happen in practice", func() {
   341  			Convey("Notices Run losing track of this long operation", func() {
   342  				stop := l.start(ctx, loadRun(), opID)
   343  				defer stop()
   344  
   345  				r := loadRun()
   346  				r.OngoingLongOps = nil
   347  				So(datastore.Put(ctx, r), ShouldBeNil)
   348  
   349  				ct.Clock.Add(time.Minute)
   350  				// Must be done soon.
   351  				assertDone()
   352  				// Treat it as if the long op was requested to be cancelled.
   353  				So(l.check(), ShouldBeTrue)
   354  			})
   355  
   356  			Convey("Notices Run deletion", func() {
   357  				stop := l.start(ctx, loadRun(), opID)
   358  				defer stop()
   359  
   360  				So(datastore.Delete(ctx, loadRun()), ShouldBeNil)
   361  
   362  				ct.Clock.Add(time.Minute)
   363  				// Must be done soon.
   364  				assertDone()
   365  				// Treat it as if the long op was requested to be cancelled.
   366  				So(l.check(), ShouldBeTrue)
   367  			})
   368  		})
   369  
   370  		Convey("Background goroutine lifetime is bounded", func() {
   371  			Convey("by calling stop()", func() {
   372  				stop := l.start(ctx, loadRun(), opID)
   373  				assertNotDone()
   374  				stop()
   375  				assertDone()
   376  				So(l.check(), ShouldBeFalse) // the long op is still not cancelled
   377  			})
   378  
   379  			Convey("by context", func() {
   380  				Convey("when context expires", func() {
   381  					innerCtx, ctxCancel := clock.WithTimeout(ctx, time.Minute)
   382  					defer ctxCancel() // to cleanup test resources, not actually relevant to the test
   383  					stop := l.start(innerCtx, loadRun(), opID)
   384  					defer stop()
   385  					ct.Clock.Add(time.Hour) // expire the innerCtx
   386  					assertDone()
   387  					So(l.check(), ShouldBeFalse) // the long op is still not cancelled
   388  				})
   389  
   390  				Convey("context is cancelled", func() {
   391  					innerCtx, ctxCancel := clock.WithTimeout(ctx, time.Minute)
   392  					stop := l.start(innerCtx, loadRun(), opID)
   393  					defer stop()
   394  					ctxCancel()
   395  					assertDone()
   396  					So(l.check(), ShouldBeFalse) // the long op is still not cancelled
   397  				})
   398  			})
   399  		})
   400  	})
   401  }