go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/longop.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package impl 16 17 import ( 18 "context" 19 "fmt" 20 "sync/atomic" 21 "time" 22 23 "go.chromium.org/luci/common/clock" 24 "go.chromium.org/luci/common/errors" 25 "go.chromium.org/luci/common/logging" 26 "go.chromium.org/luci/common/retry/transient" 27 "go.chromium.org/luci/gae/service/datastore" 28 "go.chromium.org/luci/server/tq" 29 30 bbfacade "go.chromium.org/luci/cv/internal/buildbucket/facade" 31 "go.chromium.org/luci/cv/internal/common" 32 "go.chromium.org/luci/cv/internal/run" 33 "go.chromium.org/luci/cv/internal/run/eventpb" 34 "go.chromium.org/luci/cv/internal/run/impl/longops" 35 ) 36 37 // enqueueLongOp enqueues long operations task and updates the given Run's long 38 // operations state. 39 func (rp *runProcessor) enqueueLongOps(ctx context.Context, r *run.Run, opIDs ...string) error { 40 for _, opID := range opIDs { 41 err := rp.tqDispatcher.AddTask(ctx, &tq.Task{ 42 Title: fmt.Sprintf("%s/%s/%T", r.ID, opID, r.OngoingLongOps.GetOps()[opID].GetWork()), 43 Payload: &eventpb.ManageRunLongOpTask{ 44 RunId: string(r.ID), 45 OperationId: opID, 46 }, 47 }) 48 if err != nil { 49 return err 50 } 51 } 52 return nil 53 } 54 55 func (rm *RunManager) doLongOperation(ctx context.Context, task *eventpb.ManageRunLongOpTask) error { 56 notifyCompleted := func(res *eventpb.LongOpCompleted) error { 57 if res.Status == eventpb.LongOpCompleted_LONG_OP_STATUS_UNSPECIFIED { 58 panic(fmt.Errorf("LongOpCompleted.Status must be set")) 59 } 60 res.OperationId = task.GetOperationId() 61 return rm.runNotifier.NotifyLongOpCompleted(ctx, common.RunID(task.GetRunId()), res) 62 } 63 64 r := &run.Run{ID: common.RunID(task.GetRunId())} 65 switch err := datastore.Get(ctx, r); { 66 case err == datastore.ErrNoSuchEntity: 67 // Highly unexpected. Fail hard. 68 return errors.Annotate(err, "Run %q not found", r.ID).Err() 69 case err != nil: 70 // Will retry. 71 return errors.Annotate(err, "failed to load Run %q", r.ID).Tag(transient.Tag).Err() 72 case r.OngoingLongOps.GetOps()[task.GetOperationId()] == nil: 73 // Highly unexpected. Fail hard. 74 return errors.Annotate(err, "Run %q has no outstanding long operation %q", r.ID, task.GetOperationId()).Err() 75 } 76 op := r.OngoingLongOps.GetOps()[task.GetOperationId()] 77 78 now := clock.Now(ctx) 79 d := op.GetDeadline().AsTime() 80 if d.Before(now) { 81 result := &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_EXPIRED} 82 if err := notifyCompleted(result); err != nil { 83 logging.Errorf(ctx, "Failed to NotifyLongOpCompleted: %s", err) 84 } 85 return errors.Reason("DoLongRunOperationTask arrived too late (deadline: %s, now %s)", d, now).Err() 86 } 87 88 checker := &longOpCancellationChecker{} 89 stop := checker.start(ctx, r, task.GetOperationId()) 90 defer stop() 91 92 opBase := longops.Base{ 93 Run: r, 94 Op: op, 95 IsCancelRequested: checker.check, 96 } 97 98 dctx, cancel := clock.WithDeadline(ctx, d) 99 defer cancel() 100 f := rm.doLongOperationWithDeadline 101 if rm.testDoLongOperationWithDeadline != nil { 102 f = rm.testDoLongOperationWithDeadline 103 } 104 result, err := f(dctx, &opBase) 105 106 switch { 107 case err == nil: 108 case transient.Tag.In(err): 109 // Just retry. 110 return err 111 case result != nil: 112 // honor the result status returned by the long op 113 case errors.Unwrap(err) == dctx.Err(): 114 logging.Warningf(ctx, "Failed long op due to hitting a deadline, setting result as EXPIRED") 115 result = &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_EXPIRED} 116 default: 117 // permanent failure 118 result = &eventpb.LongOpCompleted{Status: eventpb.LongOpCompleted_FAILED} 119 } 120 121 switch nerr := notifyCompleted(result); { 122 case nerr == nil: 123 return err 124 case err == nil: 125 // the op succeeded, but notify failed. 126 logging.Errorf(ctx, "Long op succeeded but it failed to notify Run Manager: %s", nerr) 127 return nerr 128 default: 129 // both op and notify failed. 130 logging.Errorf(ctx, "Long op %T permanently failed with %s, but also failed to notify Run Manager due to %s", op.GetWork(), err, nerr) 131 return err 132 } 133 } 134 135 func (rm *RunManager) doLongOperationWithDeadline(ctx context.Context, opBase *longops.Base) (*eventpb.LongOpCompleted, error) { 136 var op longops.Operation 137 switch w := opBase.Op.GetWork().(type) { 138 case *run.OngoingLongOps_Op_PostStartMessage: 139 op = &longops.PostStartMessageOp{ 140 Base: opBase, 141 Env: rm.env, 142 GFactory: rm.gFactory, 143 } 144 case *run.OngoingLongOps_Op_PostGerritMessage_: 145 op = &longops.PostGerritMessageOp{ 146 Base: opBase, 147 Env: rm.env, 148 GFactory: rm.gFactory, 149 } 150 case *run.OngoingLongOps_Op_ResetTriggers_: 151 op = &longops.ResetTriggersOp{ 152 Base: opBase, 153 GFactory: rm.gFactory, 154 CLMutator: rm.clMutator, 155 Concurrency: 8, 156 } 157 case *run.OngoingLongOps_Op_ExecuteTryjobs: 158 op = &longops.ExecuteTryjobsOp{ 159 Base: opBase, 160 Env: rm.env, 161 RunNotifier: rm.runNotifier, 162 Backend: &bbfacade.Facade{ 163 ClientFactory: rm.bbFactory, 164 }, 165 } 166 case *run.OngoingLongOps_Op_ExecutePostAction: 167 op = &longops.ExecutePostActionOp{ 168 Base: opBase, 169 GFactory: rm.gFactory, 170 RunNotifier: rm.runNotifier, 171 QM: rm.qm, 172 } 173 default: 174 logging.Errorf(ctx, "unknown LongOp work %T", w) 175 // Fail task quickly for backwards compatibility in case of a rollback during 176 // future deployment. 177 return nil, errors.Reason("Skipping %T", opBase.Op.GetWork()).Tag(tq.Fatal).Err() 178 } 179 return op.Do(ctx) 180 } 181 182 // longOpCancellationChecker asynchronously checks whether the given operation 183 // was requested to be cancelled by the Run Manager. 184 type longOpCancellationChecker struct { 185 // Options. 186 187 // interval controls how frequently the Datastore is checked. Defaults to 5s. 188 interval time.Duration 189 // testChan is used in tests to detect when background goroutine exists. 190 testChan chan struct{} 191 192 // Internal state. 193 194 // state is atomically updated int which stores state of the cancellation 195 // 196 // state is atomically updated int which stores state of the cancelation 197 // checker: 198 // * cancellationCheckerInitial 199 // * cancellationCheckerStarted 200 // * cancellationCheckerRequested 201 state int32 202 } 203 204 const ( 205 cancellationCheckerInitial = 0 206 cancellationCheckerStarted = 1 207 cancellationCheckerRequested = 2 208 ) 209 210 // check quickly and cheaply checks whether cancellation was requested. 211 // 212 // Does not block on anything. 213 func (l *longOpCancellationChecker) check() bool { 214 return atomic.LoadInt32(&l.state) == cancellationCheckerRequested 215 } 216 217 // start spawns a goroutine checking if cancellation was requested. 218 // 219 // Returns a stop function, which should be called to free resources. 220 func (l *longOpCancellationChecker) start(ctx context.Context, initial *run.Run, opID string) (stop func()) { 221 if !atomic.CompareAndSwapInt32(&l.state, cancellationCheckerInitial, cancellationCheckerStarted) { 222 panic(fmt.Errorf("start called more than once")) 223 } 224 switch { 225 case l.interval < 0: 226 panic(fmt.Errorf("negative interval %s", l.interval)) 227 case l.interval == 0: 228 l.interval = 5 * time.Second 229 case l.interval < time.Second: 230 // If lower frequency is desired in the future, use Redis directly (instead 231 // of indirectly via dscache). 232 panic(fmt.Errorf("too small interval %s -- don't hammer Datastore", l.interval)) 233 } 234 235 ctx, stop = context.WithCancel(ctx) 236 237 // Perform check on initial Run state immediately. 238 // This is useful if TQ task performing long op is retried s.t. the request 239 // for cancellation is detected immediately as opposed to during the next 240 // reload of the Run. 241 if !l.reevaluate(ctx, opID, initial, nil) { 242 return stop 243 } 244 245 go func() { 246 if l.testChan != nil { 247 defer close(l.testChan) 248 } 249 l.background(ctx, opID, initial.ID) 250 }() 251 252 return stop 253 } 254 255 func (l *longOpCancellationChecker) background(ctx context.Context, opID string, runID common.RunID) { 256 next := clock.Now(ctx) 257 for { 258 next = next.Add(l.interval) 259 left := next.Sub(clock.Now(ctx)) 260 if left > 0 { 261 if res := <-clock.After(ctx, left); res.Err != nil { 262 // Context got cancelled. 263 break 264 } 265 } 266 r := run.Run{ID: runID} 267 err := datastore.Get(ctx, &r) 268 if !l.reevaluate(ctx, opID, &r, err) { 269 break 270 } 271 } 272 } 273 274 // reevaluate updates state if necessary and returns whether the background 275 // checking should continue. 276 func (l *longOpCancellationChecker) reevaluate(ctx context.Context, opID string, r *run.Run, err error) bool { 277 switch { 278 case err == datastore.ErrNoSuchEntity: 279 logging.Errorf(ctx, "Run was unexpectedly deleted") 280 atomic.StoreInt32(&l.state, cancellationCheckerRequested) 281 return false 282 case err != nil && ctx.Err() != nil: 283 // Context was cancelled or expired. 284 return false 285 case err != nil: 286 logging.Warningf(ctx, "Failed to reload Run, will retry: %s", err) 287 return true 288 case r.OngoingLongOps == nil || r.OngoingLongOps.GetOps()[opID] == nil: 289 logging.Warningf(ctx, "Reloaded Run no longer has this operation") 290 atomic.StoreInt32(&l.state, cancellationCheckerRequested) 291 return false 292 case r.OngoingLongOps.GetOps()[opID].GetCancelRequested(): 293 logging.Warningf(ctx, "Cancellation request detected") 294 atomic.StoreInt32(&l.state, cancellationCheckerRequested) 295 return false 296 default: 297 return true 298 } 299 }