go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/manager.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package impl 16 17 import ( 18 "context" 19 "fmt" 20 "strings" 21 "time" 22 23 "google.golang.org/protobuf/encoding/protojson" 24 "google.golang.org/protobuf/proto" 25 26 "go.chromium.org/luci/common/clock" 27 "go.chromium.org/luci/common/errors" 28 "go.chromium.org/luci/common/logging" 29 "go.chromium.org/luci/common/retry/transient" 30 "go.chromium.org/luci/gae/service/datastore" 31 "go.chromium.org/luci/server/tq" 32 33 "go.chromium.org/luci/cv/internal/buildbucket" 34 "go.chromium.org/luci/cv/internal/changelist" 35 "go.chromium.org/luci/cv/internal/common" 36 "go.chromium.org/luci/cv/internal/common/bq" 37 "go.chromium.org/luci/cv/internal/common/eventbox" 38 "go.chromium.org/luci/cv/internal/common/lease" 39 "go.chromium.org/luci/cv/internal/common/tree" 40 "go.chromium.org/luci/cv/internal/gerrit" 41 "go.chromium.org/luci/cv/internal/gerrit/trigger" 42 "go.chromium.org/luci/cv/internal/prjmanager" 43 "go.chromium.org/luci/cv/internal/quota" 44 "go.chromium.org/luci/cv/internal/run" 45 runbq "go.chromium.org/luci/cv/internal/run/bq" 46 "go.chromium.org/luci/cv/internal/run/eventpb" 47 "go.chromium.org/luci/cv/internal/run/impl/handler" 48 "go.chromium.org/luci/cv/internal/run/impl/longops" 49 "go.chromium.org/luci/cv/internal/run/impl/state" 50 "go.chromium.org/luci/cv/internal/run/impl/submit" 51 "go.chromium.org/luci/cv/internal/run/pubsub" 52 "go.chromium.org/luci/cv/internal/run/rdb" 53 "go.chromium.org/luci/cv/internal/tryjob" 54 ) 55 56 // maxEventsPerBatch limits the number of incoming events that the RM will 57 // process at once. 58 // 59 // This shouldn't be hit in practice under normal operation. This is chosen 60 // such that RM can read these events and make some progress in 1 minute. 61 const maxEventsPerBatch = 10000 62 63 // RunManager manages Runs. 64 // 65 // It decides starting, cancellation, submission etc. for Runs. 66 type RunManager struct { 67 runNotifier *run.Notifier 68 pmNotifier *prjmanager.Notifier 69 clMutator *changelist.Mutator 70 tqDispatcher *tq.Dispatcher 71 env *common.Env 72 gFactory gerrit.Factory 73 bbFactory buildbucket.ClientFactory 74 qm *quota.Manager 75 handler handler.Handler 76 77 testDoLongOperationWithDeadline func(context.Context, *longops.Base) (*eventpb.LongOpCompleted, error) 78 } 79 80 // New constructs a new RunManager instance. 81 func New( 82 n *run.Notifier, 83 pm *prjmanager.Notifier, 84 tn *tryjob.Notifier, 85 clm *changelist.Mutator, 86 clu *changelist.Updater, 87 g gerrit.Factory, 88 bb buildbucket.ClientFactory, 89 tc tree.Client, 90 bqc bq.Client, 91 rdbf rdb.RecorderClientFactory, 92 qm *quota.Manager, 93 env *common.Env, 94 ) *RunManager { 95 rm := &RunManager{ 96 runNotifier: n, 97 pmNotifier: pm, 98 clMutator: clm, 99 tqDispatcher: n.TasksBinding.TQDispatcher, 100 env: env, 101 gFactory: g, 102 bbFactory: bb, 103 qm: qm, 104 handler: &handler.Impl{ 105 PM: pm, 106 RM: n, 107 TN: tn, 108 QM: qm, 109 CLUpdater: clu, 110 CLMutator: clm, 111 BQExporter: runbq.NewExporter(n.TasksBinding.TQDispatcher, bqc, env), 112 RdbNotifier: rdb.NewNotifier(n.TasksBinding.TQDispatcher, rdbf), 113 GFactory: g, 114 TreeClient: tc, 115 Publisher: pubsub.NewPublisher(n.TasksBinding.TQDispatcher, env), 116 Env: env, 117 }, 118 } 119 n.TasksBinding.ManageRun.AttachHandler( 120 func(ctx context.Context, payload proto.Message) error { 121 task := payload.(*eventpb.ManageRunTask) 122 ctx = logging.SetField(ctx, "run", task.GetRunId()) 123 err := rm.manageRun(ctx, common.RunID(task.GetRunId())) 124 // TODO(tandrii/yiwzhang): avoid retries iff we know a new task was 125 // already scheduled for the next second. 126 return common.TQIfy{ 127 KnownRetry: []error{ 128 submit.ErrTransientSubmissionFailure, 129 }, 130 KnownIgnoreTags: []errors.BoolTag{ 131 trigger.ErrResetPreconditionFailedTag, 132 common.DSContentionTag, 133 ignoreErrTag, 134 }, 135 }.Error(ctx, err) 136 }, 137 ) 138 n.TasksBinding.ManageRunLongOp.AttachHandler( 139 func(ctx context.Context, payload proto.Message) error { 140 task := payload.(*eventpb.ManageRunLongOpTask) 141 ctx = logging.SetFields(ctx, logging.Fields{ 142 "run": task.GetRunId(), 143 "operation": task.GetOperationId(), 144 }) 145 err := rm.doLongOperation(ctx, task) 146 return common.TQifyError(ctx, err) 147 }, 148 ) 149 return rm 150 } 151 152 var ignoreErrTag = errors.BoolTag{ 153 Key: errors.NewTagKey("intentionally ignored RM error"), 154 } 155 156 var pokeInterval = 5 * time.Minute 157 158 var fakeHandlerKey = "Fake Run Events Handler" 159 160 // manageRun periodically processes the events sent for a specific Run, thus 161 // moving the Run along its lifecycle. 162 func (rm *RunManager) manageRun(ctx context.Context, runID common.RunID) error { 163 proc := &runProcessor{ 164 runID: runID, 165 runNotifier: rm.runNotifier, 166 pmNotifier: rm.pmNotifier, 167 tqDispatcher: rm.tqDispatcher, 168 handler: rm.handler, 169 } 170 if h, ok := ctx.Value(&fakeHandlerKey).(handler.Handler); ok { 171 proc.handler = h 172 } 173 174 recipient := run.EventboxRecipient(ctx, runID) 175 postProcessFns, processErr := eventbox.ProcessBatch(ctx, recipient, proc, maxEventsPerBatch) 176 if processErr != nil { 177 if alreadyInLeaseErr, ok := lease.IsAlreadyInLeaseErr(processErr); ok { 178 expireTime := alreadyInLeaseErr.ExpireTime.UTC() 179 if err := rm.runNotifier.Invoke(ctx, runID, expireTime); err != nil { 180 return err 181 } 182 logging.Infof(ctx, "failed to acquire the lease for %q, revisit at %s", alreadyInLeaseErr.ResourceID, expireTime) 183 return ignoreErrTag.Apply(processErr) 184 } 185 if errors.Is(processErr, errRunMissing) { 186 logging.Errorf(ctx, "run %s is missing from datastore but got manage-run task. It's likely the run has been wiped out but something in LUCI CV is still scheduling tq task against the run.", runID) 187 return ignoreErrTag.Apply(processErr) 188 } 189 return processErr 190 } 191 192 for _, postProcessFn := range postProcessFns { 193 if err := postProcessFn(ctx); err != nil { 194 return err 195 } 196 } 197 return nil 198 } 199 200 // runProcessor implements eventbox.Processor. 201 type runProcessor struct { 202 runID common.RunID 203 204 runNotifier *run.Notifier 205 pmNotifier *prjmanager.Notifier 206 tqDispatcher *tq.Dispatcher 207 208 handler handler.Handler 209 } 210 211 var _ eventbox.Processor = (*runProcessor)(nil) 212 213 var errRunMissing = errors.New("requested run entity is missing") 214 215 // LoadState is called to load the state before a transaction. 216 func (rp *runProcessor) LoadState(ctx context.Context) (eventbox.State, eventbox.EVersion, error) { 217 r := run.Run{ID: rp.runID} 218 switch err := datastore.Get(ctx, &r); { 219 case err == datastore.ErrNoSuchEntity: 220 return nil, 0, errRunMissing 221 case err != nil: 222 return nil, 0, errors.Annotate(err, "failed to get Run %q", rp.runID).Tag(transient.Tag).Err() 223 } 224 rs := &state.RunState{Run: r} 225 return rs, eventbox.EVersion(r.EVersion), nil 226 } 227 228 // PrepareMutation is called before a transaction to compute transitions based on a 229 // batch of events. 230 // 231 // All actions that must be done atomically with updating state must be 232 // encapsulated inside Transition.SideEffectFn callback. 233 func (rp *runProcessor) PrepareMutation(ctx context.Context, events eventbox.Events, s eventbox.State) ([]eventbox.Transition, eventbox.Events, error) { 234 tr := &triageResult{} 235 var eventLog strings.Builder 236 eventLog.WriteString(fmt.Sprintf("Received %d events: ", len(events))) 237 for _, e := range events { 238 eventLog.WriteRune('\n') 239 eventLog.WriteString(" * ") 240 tr.triage(ctx, e, &eventLog) 241 } 242 logging.Debugf(ctx, eventLog.String()) 243 ts, err := rp.processTriageResults(ctx, tr, s.(*state.RunState)) 244 return ts, tr.garbage, err 245 } 246 247 // FetchEVersion is called at the beginning of a transaction. 248 // 249 // The returned EVersion is compared against the one associated with a state 250 // loaded via GetState. If different, the transaction is aborted and new state 251 // isn't saved. 252 func (rp *runProcessor) FetchEVersion(ctx context.Context) (eventbox.EVersion, error) { 253 r := &run.Run{ID: rp.runID} 254 if err := datastore.Get(ctx, r); err != nil { 255 return 0, errors.Annotate(err, "failed to get %q", rp.runID).Tag(transient.Tag).Err() 256 } 257 return eventbox.EVersion(r.EVersion), nil 258 } 259 260 // SaveState is called in a transaction to save the state if it has changed. 261 // 262 // The passed eversion is incremented value of eversion of what GetState 263 // returned before. 264 func (rp *runProcessor) SaveState(ctx context.Context, st eventbox.State, ev eventbox.EVersion) error { 265 rs := st.(*state.RunState) 266 r := rs.Run 267 r.EVersion = int64(ev) 268 r.UpdateTime = datastore.RoundTime(clock.Now(ctx).UTC()) 269 // Transactionally enqueue new long operations (if any). 270 if err := rp.enqueueLongOps(ctx, &rs.Run, rs.NewLongOpIDs...); err != nil { 271 return err 272 } 273 if err := datastore.Put(ctx, &r); err != nil { 274 return errors.Annotate(err, "failed to put Run %q", r.ID).Tag(transient.Tag).Err() 275 } 276 277 if len(rs.LogEntries) > 0 { 278 l := run.RunLog{ 279 ID: int64(ev), 280 Run: datastore.MakeKey(ctx, common.RunKind, string(r.ID)), 281 Entries: &run.LogEntries{Entries: rs.LogEntries}, 282 } 283 if err := datastore.Put(ctx, &l); err != nil { 284 return errors.Annotate(err, "failed to put RunLog %q", r.ID).Tag(transient.Tag).Err() 285 } 286 } 287 return nil 288 } 289 290 // triageResult is the result of the triage of the incoming events. 291 type triageResult struct { 292 startEvents eventbox.Events 293 cancelEvents struct { 294 events eventbox.Events 295 reasons []string 296 } 297 pokeEvents eventbox.Events 298 newConfigEvents struct { 299 events eventbox.Events 300 hash string 301 eversion int64 302 } 303 clUpdatedEvents struct { 304 events eventbox.Events 305 cls common.CLIDs 306 } 307 readyForSubmissionEvents eventbox.Events 308 clsSubmittedEvents struct { 309 events eventbox.Events 310 cls common.CLIDs 311 } 312 submissionCompletedEvent struct { 313 event eventbox.Event 314 sc *eventpb.SubmissionCompleted 315 } 316 longOpCompleted struct { 317 events eventbox.Events 318 results []*eventpb.LongOpCompleted 319 } 320 tryjobUpdatedEvents struct { 321 events eventbox.Events 322 tryjobs common.TryjobIDs 323 } 324 parentCompletedEvents eventbox.Events 325 nextReadyEventTime time.Time 326 // These events can be deleted even before the transaction starts. 327 garbage eventbox.Events 328 } 329 330 func (tr *triageResult) triage(ctx context.Context, item eventbox.Event, eventLog *strings.Builder) { 331 e := &eventpb.Event{} 332 if err := proto.Unmarshal(item.Value, e); err != nil { 333 // This is a bug in code or data corruption. 334 // There is no way to recover on its own. 335 logging.Errorf(ctx, "CRITICAL: failed to deserialize event %q: %s", item.ID, err) 336 panic(err) 337 } 338 eventLog.WriteString(fmt.Sprintf("%T: ", e.GetEvent())) 339 // Use compact JSON to make log short. 340 eventLog.WriteString((protojson.MarshalOptions{Multiline: false}).Format(e)) 341 if pa := e.GetProcessAfter().AsTime(); pa.After(clock.Now(ctx)) { 342 if tr.nextReadyEventTime.IsZero() || pa.Before(tr.nextReadyEventTime) { 343 tr.nextReadyEventTime = pa 344 } 345 return 346 } 347 switch e.GetEvent().(type) { 348 case *eventpb.Event_Start: 349 tr.startEvents = append(tr.startEvents, item) 350 case *eventpb.Event_Cancel: 351 tr.cancelEvents.events = append(tr.cancelEvents.events, item) 352 tr.cancelEvents.reasons = append(tr.cancelEvents.reasons, e.GetCancel().GetReason()) 353 case *eventpb.Event_Poke: 354 tr.pokeEvents = append(tr.pokeEvents, item) 355 case *eventpb.Event_NewConfig: 356 // Record all events but only the latest config hash. 357 tr.newConfigEvents.events = append(tr.newConfigEvents.events, item) 358 if ev := e.GetNewConfig().GetEversion(); ev > tr.newConfigEvents.eversion { 359 tr.newConfigEvents.eversion = ev 360 tr.newConfigEvents.hash = e.GetNewConfig().GetHash() 361 } 362 case *eventpb.Event_ClsUpdated: 363 tr.clUpdatedEvents.events = append(tr.clUpdatedEvents.events, item) 364 for _, one := range e.GetClsUpdated().GetEvents() { 365 tr.clUpdatedEvents.cls = append(tr.clUpdatedEvents.cls, common.CLID(one.GetClid())) 366 } 367 case *eventpb.Event_ReadyForSubmission: 368 tr.readyForSubmissionEvents = append(tr.readyForSubmissionEvents, item) 369 case *eventpb.Event_ClsSubmitted: 370 tr.clsSubmittedEvents.events = append(tr.clsSubmittedEvents.events, item) 371 for _, clid := range e.GetClsSubmitted().GetClids() { 372 tr.clsSubmittedEvents.cls = append(tr.clsSubmittedEvents.cls, common.CLID(clid)) 373 } 374 case *eventpb.Event_SubmissionCompleted: 375 existing, new := tr.submissionCompletedEvent.sc, e.GetSubmissionCompleted() 376 if existing != nil { 377 logging.Errorf(ctx, "crbug/1289448: multiple submission completed events received.\nexisting: %s\nnew: %s", existing, new) 378 } 379 if existing == nil || new.GetResult() > existing.GetResult() { 380 // Only override if the new result is worse than the existing one. 381 tr.submissionCompletedEvent.event = item 382 tr.submissionCompletedEvent.sc = new 383 } 384 case *eventpb.Event_LongOpCompleted: 385 tr.longOpCompleted.events = append(tr.longOpCompleted.events, item) 386 tr.longOpCompleted.results = append(tr.longOpCompleted.results, e.GetLongOpCompleted()) 387 case *eventpb.Event_TryjobsUpdated: 388 tr.tryjobUpdatedEvents.events = append(tr.tryjobUpdatedEvents.events, item) 389 for _, evt := range e.GetTryjobsUpdated().GetEvents() { 390 tr.tryjobUpdatedEvents.tryjobs = append(tr.tryjobUpdatedEvents.tryjobs, common.TryjobID(evt.GetTryjobId())) 391 } 392 case *eventpb.Event_ParentRunCompleted: 393 tr.parentCompletedEvents = append(tr.parentCompletedEvents, item) 394 default: 395 panic(fmt.Errorf("unknown event: %T [id=%q]", e.GetEvent(), item.ID)) 396 } 397 } 398 399 func (rp *runProcessor) processTriageResults(ctx context.Context, tr *triageResult, rs *state.RunState) ([]eventbox.Transition, error) { 400 statingState := rs 401 var transitions []eventbox.Transition 402 if len(tr.longOpCompleted.events) > 0 { 403 for i, opResult := range tr.longOpCompleted.results { 404 res, err := rp.handler.OnLongOpCompleted(ctx, rs, opResult) 405 if err != nil { 406 return nil, err 407 } 408 rs, transitions = applyResult(res, tr.longOpCompleted.events[i:i+1], transitions) 409 } 410 } 411 switch { 412 case len(tr.cancelEvents.events) > 0: 413 res, err := rp.handler.Cancel(ctx, rs, tr.cancelEvents.reasons) 414 if err != nil { 415 return nil, err 416 } 417 // Consume all the start events here as well because it is possible 418 // that Run Manager receives start and cancel events at the same time. 419 // For example, user requests to start a Run and immediately cancels 420 // it. But the duration is long enough for Project Manager to create 421 // this Run in CV. In that case, Run Manager should just move this Run 422 // to cancelled state directly. 423 events := append(tr.cancelEvents.events, tr.startEvents...) 424 rs, transitions = applyResult(res, events, transitions) 425 case len(tr.startEvents) > 0: 426 res, err := rp.handler.Start(ctx, rs) 427 if err != nil { 428 return nil, err 429 } 430 rs, transitions = applyResult(res, tr.startEvents, transitions) 431 } 432 433 if len(tr.newConfigEvents.events) > 0 { 434 res, err := rp.handler.UpdateConfig(ctx, rs, tr.newConfigEvents.hash) 435 if err != nil { 436 return nil, err 437 } 438 rs, transitions = applyResult(res, tr.newConfigEvents.events, transitions) 439 } 440 if len(tr.clUpdatedEvents.events) > 0 { 441 res, err := rp.handler.OnCLsUpdated(ctx, rs, tr.clUpdatedEvents.cls) 442 if err != nil { 443 return nil, err 444 } 445 rs, transitions = applyResult(res, tr.clUpdatedEvents.events, transitions) 446 } 447 if len(tr.tryjobUpdatedEvents.events) > 0 { 448 res, err := rp.handler.OnTryjobsUpdated(ctx, rs, tr.tryjobUpdatedEvents.tryjobs) 449 if err != nil { 450 return nil, err 451 } 452 rs, transitions = applyResult(res, tr.tryjobUpdatedEvents.events, transitions) 453 } 454 if len(tr.clsSubmittedEvents.events) > 0 { 455 res, err := rp.handler.OnCLsSubmitted(ctx, rs, tr.clsSubmittedEvents.cls) 456 if err != nil { 457 return nil, err 458 } 459 rs, transitions = applyResult(res, tr.clsSubmittedEvents.events, transitions) 460 } 461 if sc := tr.submissionCompletedEvent.sc; sc != nil { 462 res, err := rp.handler.OnSubmissionCompleted(ctx, rs, sc) 463 if err != nil { 464 return nil, err 465 } 466 rs, transitions = applyResult(res, eventbox.Events{tr.submissionCompletedEvent.event}, transitions) 467 } 468 if len(tr.readyForSubmissionEvents) > 0 { 469 res, err := rp.handler.OnReadyForSubmission(ctx, rs) 470 if err != nil { 471 return nil, err 472 } 473 rs, transitions = applyResult(res, tr.readyForSubmissionEvents, transitions) 474 } 475 if len(tr.pokeEvents) > 0 { 476 res, err := rp.handler.Poke(ctx, rs) 477 if err != nil { 478 return nil, err 479 } 480 rs, transitions = applyResult(res, tr.pokeEvents, transitions) 481 } 482 if len(tr.parentCompletedEvents) > 0 { 483 res, err := rp.handler.OnParentRunCompleted(ctx, rs) 484 if err != nil { 485 return nil, err 486 } 487 rs, transitions = applyResult(res, tr.parentCompletedEvents, transitions) 488 } 489 // Submission runs as PostProcessFn after event handling/state transition 490 // is done. It is possible that submission never reports the result back 491 // to RM (e.g. app crash in the middle, task timeout and etc.). In that 492 // case, when task retries, CV should try to resume the submission or fail 493 // the submission if deadline is exceeded even though no event was received. 494 // Therefore, always run TryResumeSubmission at the end regardless. 495 res, err := rp.handler.TryResumeSubmission(ctx, rs) 496 if err != nil { 497 return nil, err 498 } 499 _, transitions = applyResult(res, nil, transitions) 500 501 if err := rp.enqueueNextPoke(ctx, statingState.Run.Status, tr.nextReadyEventTime); err != nil { 502 return nil, err 503 } 504 return transitions, nil 505 } 506 507 func applyResult(res *handler.Result, events eventbox.Events, transitions []eventbox.Transition) (*state.RunState, []eventbox.Transition) { 508 t := eventbox.Transition{ 509 TransitionTo: res.State, 510 SideEffectFn: res.SideEffectFn, 511 PostProcessFn: res.PostProcessFn, 512 } 513 if !res.PreserveEvents { 514 t.Events = events 515 } 516 return res.State, append(transitions, t) 517 } 518 519 func (rp *runProcessor) enqueueNextPoke(ctx context.Context, startingStatus run.Status, nextReadyEventTime time.Time) error { 520 switch now := clock.Now(ctx); { 521 case run.IsEnded(startingStatus): 522 // Do not enqueue the next poke if the Run is ended at the beginning of 523 // the state transition. Not using the end state after the state 524 // transition here because CV may fail to save the state which may 525 // require the recursive poke to unblock the Run. 526 return nil 527 case nextReadyEventTime.IsZero(): 528 return rp.runNotifier.PokeAfter(ctx, rp.runID, pokeInterval) 529 case now.After(nextReadyEventTime): 530 // It is possible that by this time, next ready event is already 531 // overdue; invoke Run Manager immediately. 532 return rp.runNotifier.Invoke(ctx, rp.runID, time.Time{}) 533 case nextReadyEventTime.Before(now.Add(pokeInterval)): 534 return rp.runNotifier.Invoke(ctx, rp.runID, nextReadyEventTime) 535 default: 536 return rp.runNotifier.PokeAfter(ctx, rp.runID, pokeInterval) 537 } 538 }