go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longop_test.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package impl 16 17 import ( 18 "context" 19 "testing" 20 "time" 21 22 "google.golang.org/protobuf/proto" 23 "google.golang.org/protobuf/types/known/timestamppb" 24 25 "go.chromium.org/luci/common/clock" 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/gae/service/datastore" 28 "go.chromium.org/luci/server/tq/tqtesting" 29 30 "go.chromium.org/luci/cv/internal/common/eventbox" 31 "go.chromium.org/luci/cv/internal/cvtesting" 32 "go.chromium.org/luci/cv/internal/run" 33 "go.chromium.org/luci/cv/internal/run/eventpb" 34 "go.chromium.org/luci/cv/internal/run/impl/longops" 35 "go.chromium.org/luci/cv/internal/run/impl/state" 36 "go.chromium.org/luci/cv/internal/run/runtest" 37 38 . "github.com/smartystreets/goconvey/convey" 39 "go.chromium.org/luci/common/retry/transient" 40 . "go.chromium.org/luci/common/testing/assertions" 41 ) 42 43 func TestLongOps(t *testing.T) { 44 t.Parallel() 45 46 Convey("Manager handles long ops", t, func() { 47 ct := cvtesting.Test{} 48 ctx, cancel := ct.SetUp(t) 49 defer cancel() 50 const runID = "chromium/222-1-deadbeef" 51 const initialEVersion = 10 52 So(datastore.Put(ctx, &run.Run{ 53 ID: runID, 54 Status: run.Status_RUNNING, 55 EVersion: initialEVersion, 56 }), ShouldBeNil) 57 58 loadRun := func(ctx context.Context) *run.Run { 59 ret := &run.Run{ID: runID} 60 So(datastore.Get(ctx, ret), ShouldBeNil) 61 return ret 62 } 63 64 notifier := run.NewNotifier(ct.TQDispatcher) 65 proc := &runProcessor{ 66 runID: runID, 67 runNotifier: notifier, 68 tqDispatcher: ct.TQDispatcher, 69 } 70 71 // Create a new request. 72 rs := &state.RunState{Run: *loadRun(ctx)} 73 opID := rs.EnqueueLongOp(&run.OngoingLongOps_Op{ 74 Deadline: timestamppb.New(ct.Clock.Now().Add(time.Minute)), 75 Work: &run.OngoingLongOps_Op_PostStartMessage{ 76 PostStartMessage: true, 77 }, 78 }) 79 So(rs.OngoingLongOps.GetOps(), ShouldHaveLength, 1) 80 81 // Simulate what happens when Run state is transactionally updated. 82 So(datastore.RunInTransaction(ctx, func(ctx context.Context) error { 83 return proc.SaveState(ctx, eventbox.State(rs), eventbox.EVersion(rs.EVersion+1)) 84 }, nil), ShouldBeNil) 85 86 // Verify that Run's state records new Long operation. 87 r := loadRun(ctx) 88 So(r.OngoingLongOps.GetOps(), ShouldHaveLength, 1) 89 op := r.OngoingLongOps.GetOps()[opID] 90 So(op, ShouldResembleProto, &run.OngoingLongOps_Op{ 91 CancelRequested: false, 92 Deadline: timestamppb.New(ct.Clock.Now().Add(time.Minute)), 93 Work: &run.OngoingLongOps_Op_PostStartMessage{ 94 PostStartMessage: true, 95 }, 96 }) 97 // Verify that long op task was enqueued. 98 So(ct.TQ.Tasks().Payloads(), ShouldResembleProto, []proto.Message{ 99 &eventpb.ManageRunLongOpTask{ 100 OperationId: opID, 101 RunId: runID, 102 }, 103 }) 104 105 Convey("manager handles Long Operation TQ task", func() { 106 manager := New(notifier, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, ct.Env) 107 108 Convey("OK", func() { 109 called := false 110 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 111 called = true 112 d, ok := ctx.Deadline() 113 So(ok, ShouldBeTrue) 114 So(d.UTC(), ShouldResemble, op.GetDeadline().AsTime()) 115 return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_SUCCEEDED}, nil 116 } 117 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 118 So(called, ShouldBeTrue) 119 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 120 OperationId: opID, 121 Status: eventpb.LongOpCompleted_SUCCEEDED, 122 }) 123 }) 124 125 Convey("CancelRequested handling", func() { 126 rs := &state.RunState{Run: *loadRun(ctx)} 127 rs.RequestLongOpCancellation(opID) 128 rs.EVersion++ 129 So(datastore.Put(ctx, &rs.Run), ShouldBeNil) 130 131 called := false 132 manager.testDoLongOperationWithDeadline = func(ctx context.Context, opBase *longops.Base) (*eventpb.LongOpCompleted, error) { 133 called = true 134 So(opBase.IsCancelRequested(), ShouldBeTrue) 135 return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_CANCELLED}, nil 136 } 137 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 138 So(called, ShouldBeTrue) 139 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 140 OperationId: opID, 141 Status: eventpb.LongOpCompleted_CANCELLED, 142 }) 143 }) 144 145 Convey("Expired long op must not be executed, but Run Manager should be notified", func() { 146 ct.Clock.Add(time.Hour) 147 called := false 148 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 149 called = true 150 return &eventpb.LongOpCompleted{}, nil 151 } 152 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 153 So(called, ShouldBeFalse) 154 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 155 OperationId: opID, 156 Status: eventpb.LongOpCompleted_EXPIRED, 157 }) 158 }) 159 160 Convey("Expired while executing", func() { 161 called := false 162 manager.testDoLongOperationWithDeadline = func(dctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 163 called = true 164 ct.Clock.Add(time.Hour) // expire the `dctx` 165 // NOTE: it's unclear why the following sometimes fails: 166 for dctx.Err() == nil { 167 clock.Sleep(dctx, time.Second) 168 } 169 So(dctx.Err(), ShouldNotBeNil) 170 return nil, errors.Annotate(dctx.Err(), "somehow treating as permanent failure").Err() 171 } 172 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 173 So(called, ShouldBeTrue) 174 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 175 OperationId: opID, 176 Status: eventpb.LongOpCompleted_EXPIRED, 177 }) 178 }) 179 180 Convey("Transient failure is retried", func() { 181 called := 0 182 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 183 called++ 184 if called == 1 { 185 return nil, errors.New("troops", transient.Tag) 186 } 187 return &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_SUCCEEDED}, nil 188 } 189 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 190 So(called, ShouldEqual, 2) 191 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 192 OperationId: opID, 193 Status: eventpb.LongOpCompleted_SUCCEEDED, 194 }) 195 }) 196 197 Convey("Non-transient failure is fatal", func() { 198 called := 0 199 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 200 called++ 201 if called == 1 { 202 return nil, errors.New("foops") 203 } 204 return &eventpb.LongOpCompleted{}, nil 205 } 206 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 207 So(called, ShouldEqual, 1) 208 runtest.AssertReceivedLongOpCompleted(ctx, runID, &eventpb.LongOpCompleted{ 209 OperationId: opID, 210 Status: eventpb.LongOpCompleted_FAILED, 211 }) 212 }) 213 214 Convey("Doesn't execute in weird cases", func() { 215 Convey("Run deleted", func() { 216 So(datastore.Delete(ctx, &run.Run{ID: runID}), ShouldBeNil) 217 called := false 218 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 219 called = true 220 return &eventpb.LongOpCompleted{}, nil 221 } 222 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 223 So(called, ShouldBeFalse) 224 runtest.AssertEventboxEmpty(ctx, runID) 225 }) 226 Convey("Long op is no longer known", func() { 227 r := loadRun(ctx) 228 r.OngoingLongOps = nil 229 So(datastore.Put(ctx, r), ShouldBeNil) 230 called := false 231 manager.testDoLongOperationWithDeadline = func(ctx context.Context, _ *longops.Base) (*eventpb.LongOpCompleted, error) { 232 called = true 233 return &eventpb.LongOpCompleted{}, nil 234 } 235 ct.TQ.Run(ctx, tqtesting.StopAfterTask(eventpb.ManageRunLongOpTaskClass)) 236 So(called, ShouldBeFalse) 237 runtest.AssertEventboxEmpty(ctx, runID) 238 }) 239 }) 240 }) 241 }) 242 } 243 244 func TestLongOpCancellationChecker(t *testing.T) { 245 t.Parallel() 246 247 Convey("longOpCancellationChecker works", t, func() { 248 ct := cvtesting.Test{} 249 ctx, cancel := ct.SetUp(t) 250 defer cancel() 251 const runID = "chromium/222-1-deadbeef" 252 const opID = "op-1" 253 254 So(datastore.Put(ctx, &run.Run{ 255 ID: runID, 256 Status: run.Status_RUNNING, 257 EVersion: 1, 258 OngoingLongOps: &run.OngoingLongOps{ 259 Ops: map[string]*run.OngoingLongOps_Op{ 260 opID: { 261 CancelRequested: false, // changed in tests below 262 // Other fields aren't relevant to this test. 263 }, 264 }, 265 }, 266 }), ShouldBeNil) 267 268 loadRun := func() *run.Run { 269 ret := &run.Run{ID: runID} 270 So(datastore.Get(ctx, ret), ShouldBeNil) 271 return ret 272 } 273 274 ct.Clock.SetTimerCallback(func(dur time.Duration, _ clock.Timer) { 275 // Whenever background goroutine sleeps, awake it immediately. 276 ct.Clock.Add(dur) 277 }) 278 279 done := make(chan struct{}) 280 281 assertDone := func() { 282 select { 283 case <-done: 284 case <-ctx.Done(): 285 So("context expired before background goroutine was done", ShouldBeFalse) 286 } 287 } 288 assertNotDone := func() { 289 select { 290 case <-done: 291 So("background goroutine is done", ShouldBeFalse) 292 default: 293 } 294 } 295 296 l := longOpCancellationChecker{ 297 interval: time.Second, 298 testChan: done, 299 } 300 301 Convey("Normal operation without long op cancellation", func() { 302 stop := l.start(ctx, loadRun(), opID) 303 defer stop() 304 So(l.check(), ShouldBeFalse) 305 ct.Clock.Add(time.Minute) 306 So(l.check(), ShouldBeFalse) 307 assertNotDone() 308 }) 309 310 Convey("Initial Run state with cancellation request is noticed immediately", func() { 311 r := loadRun() 312 r.OngoingLongOps.GetOps()[opID].CancelRequested = true 313 So(datastore.Put(ctx, r), ShouldBeNil) 314 315 stop := l.start(ctx, loadRun(), opID) 316 defer stop() 317 318 So(l.check(), ShouldBeTrue) 319 // Background goroutine shouldn't even be started, hence the `done` 320 // channel should remain open. 321 assertNotDone() 322 }) 323 324 Convey("Notices cancellation request eventually", func() { 325 stop := l.start(ctx, loadRun(), opID) 326 defer stop() 327 328 // Store request to cancel. 329 r := loadRun() 330 r.OngoingLongOps.GetOps()[opID].CancelRequested = true 331 So(datastore.Put(ctx, r), ShouldBeNil) 332 333 ct.Clock.Add(time.Minute) 334 // Must be done soon. 335 assertDone() 336 // Now, the cancellation request must be noticed. 337 So(l.check(), ShouldBeTrue) 338 }) 339 340 Convey("Robust in case of edge cases which should not happen in practice", func() { 341 Convey("Notices Run losing track of this long operation", func() { 342 stop := l.start(ctx, loadRun(), opID) 343 defer stop() 344 345 r := loadRun() 346 r.OngoingLongOps = nil 347 So(datastore.Put(ctx, r), ShouldBeNil) 348 349 ct.Clock.Add(time.Minute) 350 // Must be done soon. 351 assertDone() 352 // Treat it as if the long op was requested to be cancelled. 353 So(l.check(), ShouldBeTrue) 354 }) 355 356 Convey("Notices Run deletion", func() { 357 stop := l.start(ctx, loadRun(), opID) 358 defer stop() 359 360 So(datastore.Delete(ctx, loadRun()), ShouldBeNil) 361 362 ct.Clock.Add(time.Minute) 363 // Must be done soon. 364 assertDone() 365 // Treat it as if the long op was requested to be cancelled. 366 So(l.check(), ShouldBeTrue) 367 }) 368 }) 369 370 Convey("Background goroutine lifetime is bounded", func() { 371 Convey("by calling stop()", func() { 372 stop := l.start(ctx, loadRun(), opID) 373 assertNotDone() 374 stop() 375 assertDone() 376 So(l.check(), ShouldBeFalse) // the long op is still not cancelled 377 }) 378 379 Convey("by context", func() { 380 Convey("when context expires", func() { 381 innerCtx, ctxCancel := clock.WithTimeout(ctx, time.Minute) 382 defer ctxCancel() // to cleanup test resources, not actually relevant to the test 383 stop := l.start(innerCtx, loadRun(), opID) 384 defer stop() 385 ct.Clock.Add(time.Hour) // expire the innerCtx 386 assertDone() 387 So(l.check(), ShouldBeFalse) // the long op is still not cancelled 388 }) 389 390 Convey("context is cancelled", func() { 391 innerCtx, ctxCancel := clock.WithTimeout(ctx, time.Minute) 392 stop := l.start(innerCtx, loadRun(), opID) 393 defer stop() 394 ctxCancel() 395 assertDone() 396 So(l.check(), ShouldBeFalse) // the long op is still not cancelled 397 }) 398 }) 399 }) 400 }) 401 }