go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/submit.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package handler 16 17 import ( 18 "context" 19 "fmt" 20 "sort" 21 "strconv" 22 "strings" 23 "time" 24 25 "google.golang.org/protobuf/proto" 26 "google.golang.org/protobuf/types/known/timestamppb" 27 28 "go.chromium.org/luci/common/clock" 29 "go.chromium.org/luci/common/errors" 30 "go.chromium.org/luci/common/logging" 31 gerritpb "go.chromium.org/luci/common/proto/gerrit" 32 "go.chromium.org/luci/common/retry/transient" 33 "go.chromium.org/luci/common/sync/parallel" 34 "go.chromium.org/luci/gae/service/datastore" 35 "go.chromium.org/luci/server/tq" 36 37 cfgpb "go.chromium.org/luci/cv/api/config/v2" 38 39 "go.chromium.org/luci/cv/internal/common" 40 "go.chromium.org/luci/cv/internal/common/eventbox" 41 "go.chromium.org/luci/cv/internal/configs/prjcfg" 42 "go.chromium.org/luci/cv/internal/gerrit" 43 "go.chromium.org/luci/cv/internal/run" 44 "go.chromium.org/luci/cv/internal/run/eventpb" 45 "go.chromium.org/luci/cv/internal/run/impl/state" 46 "go.chromium.org/luci/cv/internal/run/impl/submit" 47 "go.chromium.org/luci/cv/internal/run/impl/util" 48 ) 49 50 const submissionPostponedMessage = "Posting Submittion Postponed Message" 51 52 // OnReadyForSubmission implements Handler interface. 53 func (impl *Impl) OnReadyForSubmission(ctx context.Context, rs *state.RunState) (*Result, error) { 54 switch status := rs.Status; { 55 case run.IsEnded(status): 56 // It is safe to discard this event because this event arrives after Run 57 // gets cancelled while waiting for submission. 58 logging.Debugf(ctx, "received ReadyForSubmission event when Run is %s", status) 59 rs = rs.ShallowCopy() 60 // Under certain race conditions, this Run may still occupy the submit 61 // queue. So, check first without a transaction and then initiate a 62 // transaction to release if this Run currently occupies the submit queue. 63 if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil { 64 return nil, err 65 } 66 return &Result{State: rs}, nil 67 case status == run.Status_SUBMITTING: 68 // Discard this event if this Run is currently submitting. If submission 69 // is stopped and should be resumed (e.g. transient failure, app crashing), 70 // it should be handled in `OnSubmissionCompleted` or `TryResumeSubmission`. 71 logging.Debugf(ctx, "received ReadyForSubmission event when Run is submitting") 72 return &Result{State: rs}, nil 73 case status == run.Status_RUNNING: 74 // This may happen when this Run transitioned from RUNNING status to 75 // WAITING_FOR_SUBMISSION, prepared for submission but failed to 76 // save the state transition. This Run is receiving this event because 77 // of the fail-safe task sent while acquiring the Submit Queue. CV should 78 // treat this Run as WAITING_FOR_SUBMISSION status. 79 rs = rs.ShallowCopy() 80 rs.Status = run.Status_WAITING_FOR_SUBMISSION 81 case status != run.Status_WAITING_FOR_SUBMISSION: 82 panic(fmt.Errorf("impossible status %s", status)) 83 } 84 // Check DepRuns to see if we can start submitting this run. 85 if len(rs.DepRuns) > 0 { 86 runs, errs := run.LoadRunsFromIDs(rs.DepRuns...).Do(ctx) 87 for i, depr := range runs { 88 switch err := errs[i]; { 89 case err == datastore.ErrNoSuchEntity: 90 panic(fmt.Errorf("run %s has a non-existent DepRuns Run %s", rs.ID, depr.ID)) 91 case err != nil: 92 return nil, errors.Annotate(err, "failed to load run %s", depr.ID).Tag(transient.Tag).Err() 93 case !run.IsEnded(depr.Status): 94 // If a parent run has not finished we cannot submit this run. 95 rs = rs.ShallowCopy() 96 runCLs, err := run.LoadRunCLs(ctx, depr.ID, depr.CLs) 97 if err != nil { 98 return nil, err 99 } 100 for _, runCL := range runCLs { 101 rs.LogInfof(ctx, submissionPostponedMessage, "Submission postponed, waiting for upstream CL %s to be submitted", runCL.ExternalID.MustURL()) 102 } 103 return &Result{State: rs}, nil 104 case depr.Status != run.Status_SUCCEEDED: 105 // If a parent run has ended but was not successful, this run should not be submitted and 106 // the parent run will trigger OnParentRunCompleted to handle cancelling this run. 107 return &Result{State: rs}, nil 108 } 109 } 110 } 111 112 if !run.ShouldSubmit(&rs.Run) { 113 panic(fmt.Errorf("impossible, %s runs cannot submit CLs", rs.Mode)) 114 } 115 if len(rs.Submission.GetSubmittedCls()) > 0 { 116 panic(fmt.Errorf("impossible; Run %q is in Status_WAITING_FOR_SUBMISSION status but has submitted CLs ", rs.ID)) 117 } 118 rs = rs.ShallowCopy() 119 cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID) 120 if err != nil { 121 return nil, err 122 } 123 switch waitlisted, err := acquireSubmitQueue(ctx, rs, impl.RM, cg.SubmitOptions); { 124 case err != nil: 125 return nil, err 126 case waitlisted: 127 // This Run will be notified by Submit Queue once its turn comes. 128 return &Result{State: rs}, nil 129 } 130 rs.CloneSubmission() 131 switch treeOpen, treeErr := rs.CheckTree(ctx, impl.TreeClient); { 132 case treeErr != nil && clock.Since(ctx, rs.Submission.TreeErrorSince.AsTime()) > treeStatusFailureTimeLimit: 133 // Failed to fetch status for a long time. Fail the Run. 134 if err := releaseSubmitQueue(ctx, rs, impl.RM); err != nil { 135 return nil, err 136 } 137 rims := make(map[common.CLID]reviewInputMeta, len(rs.CLs)) 138 whoms := rs.Mode.GerritNotifyTargets() 139 meta := reviewInputMeta{ 140 notify: whoms, 141 // Add the same set of group/people to the attention set. 142 addToAttention: whoms, 143 reason: treeStatusCheckFailedReason, 144 message: fmt.Sprintf(persistentTreeStatusAppFailureTemplate, cg.Content.GetVerifiers().GetTreeStatus().GetUrl()), 145 } 146 for _, cl := range rs.CLs { 147 if !rs.HasRootCL() || rs.RootCL == cl { 148 rims[cl] = meta 149 } 150 } 151 scheduleTriggersReset(ctx, rs, rims, run.Status_FAILED) 152 return &Result{ 153 State: rs, 154 }, nil 155 case treeErr != nil: 156 logging.Warningf(ctx, "tree-status check failed with: %s, retrying in %s", treeErr, treeCheckInterval) 157 fallthrough 158 case !treeOpen: 159 err := parallel.WorkPool(2, func(work chan<- func() error) { 160 work <- func() error { 161 // Tree is closed or status unknown, revisit after 1 minute. 162 return impl.RM.PokeAfter(ctx, rs.ID, treeCheckInterval) 163 } 164 work <- func() error { 165 // Give up the Submit Queue while waiting for tree to open. 166 return releaseSubmitQueue(ctx, rs, impl.RM) 167 } 168 }) 169 if err != nil { 170 return nil, common.MostSevereError(err) 171 } 172 return &Result{State: rs}, nil 173 default: 174 if err := markSubmitting(ctx, rs); err != nil { 175 return nil, err 176 } 177 s := submit.NewSubmitter(ctx, rs.ID, rs.Submission, impl.RM, impl.GFactory) 178 rs.SubmissionScheduled = true 179 return &Result{ 180 State: rs, 181 PostProcessFn: s.Submit, 182 }, nil 183 } 184 } 185 186 // OnCLsSubmitted implements Handler interface. 187 func (*Impl) OnCLsSubmitted(ctx context.Context, rs *state.RunState, clids common.CLIDs) (*Result, error) { 188 switch status := rs.Status; { 189 case run.IsEnded(status): 190 logging.Warningf(ctx, "received CLsSubmitted event when Run is %s", status) 191 return &Result{State: rs}, nil 192 case status != run.Status_SUBMITTING: 193 return nil, errors.Reason("expected SUBMITTING status; got %s", status).Err() 194 } 195 rs = rs.ShallowCopy() 196 rs.Submission = proto.Clone(rs.Submission).(*run.Submission) 197 submitted := clids.Set() 198 for _, cl := range rs.Submission.GetSubmittedCls() { 199 submitted.AddI64(cl) 200 } 201 if rs.Submission.GetSubmittedCls() != nil { 202 rs.Submission.SubmittedCls = rs.Submission.SubmittedCls[:0] 203 } 204 for _, cl := range rs.Submission.GetCls() { 205 if submitted.HasI64(cl) { 206 rs.Submission.SubmittedCls = append(rs.Submission.SubmittedCls, cl) 207 submitted.DelI64(cl) 208 } 209 } 210 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 211 Time: timestamppb.New(clock.Now(ctx)), 212 Kind: &run.LogEntry_ClSubmitted{ 213 ClSubmitted: &run.LogEntry_CLSubmitted{ 214 NewlySubmittedCls: common.CLIDsAsInt64s(clids), 215 TotalSubmitted: int64(len(rs.Submission.SubmittedCls)), 216 }, 217 }, 218 }) 219 if len(submitted) > 0 { 220 unexpected := make(sort.IntSlice, 0, len(submitted)) 221 for clid := range submitted { 222 unexpected = append(unexpected, int(clid)) 223 } 224 unexpected.Sort() 225 return nil, errors.Reason("received CLsSubmitted event for cls not belonging to this Run: %v", unexpected).Err() 226 } 227 return &Result{State: rs}, nil 228 } 229 230 // OnSubmissionCompleted implements Handler interface. 231 func (impl *Impl) OnSubmissionCompleted(ctx context.Context, rs *state.RunState, sc *eventpb.SubmissionCompleted) (*Result, error) { 232 switch status := rs.Status; { 233 case run.IsEnded(status): 234 logging.Warningf(ctx, "received SubmissionCompleted event when Run is %s", status) 235 rs = rs.ShallowCopy() 236 if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil { 237 return nil, err 238 } 239 return &Result{State: rs}, nil 240 case status != run.Status_SUBMITTING: 241 return nil, errors.Reason("expected SUBMITTING status; got %s", status).Err() 242 } 243 244 rs = rs.ShallowCopy() 245 if sc.GetQueueReleaseTimestamp() != nil { 246 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 247 Time: sc.GetQueueReleaseTimestamp(), 248 Kind: &run.LogEntry_ReleasedSubmitQueue_{ 249 ReleasedSubmitQueue: &run.LogEntry_ReleasedSubmitQueue{}, 250 }, 251 }) 252 } 253 254 cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID) 255 if err != nil { 256 return nil, errors.Annotate(err, "prjcfg.GetConfigGroup").Err() 257 } 258 switch sc.GetResult() { 259 case eventpb.SubmissionResult_SUCCEEDED: 260 childRuns, err := run.LoadChildRuns(ctx, rs.ID) 261 if err != nil { 262 return nil, errors.Annotate(err, "failed to load child runs").Err() 263 } 264 se := impl.endRun(ctx, rs, run.Status_SUCCEEDED, cg, childRuns) 265 // Only mark run submitted to ResultDB when submission run is successful. 266 se = eventbox.Chain( 267 se, 268 func(ctx context.Context) error { 269 return impl.RdbNotifier.Schedule(ctx, rs.ID) 270 }, 271 ) 272 return &Result{ 273 State: rs, 274 SideEffectFn: se, 275 }, nil 276 case eventpb.SubmissionResult_FAILED_TRANSIENT: 277 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 278 Time: timestamppb.New(clock.Now(ctx)), 279 Kind: &run.LogEntry_SubmissionFailure_{ 280 SubmissionFailure: &run.LogEntry_SubmissionFailure{ 281 Event: sc, 282 }, 283 }, 284 }) 285 return impl.tryResumeSubmission(ctx, rs, sc) 286 case eventpb.SubmissionResult_FAILED_PERMANENT: 287 if clFailures := sc.GetClFailures(); clFailures != nil { 288 failedCLs := make([]int64, len(clFailures.GetFailures())) 289 for i, f := range clFailures.GetFailures() { 290 failedCLs[i] = f.GetClid() 291 } 292 rs.Submission = proto.Clone(rs.Submission).(*run.Submission) 293 rs.Submission.FailedCls = failedCLs 294 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 295 Time: timestamppb.New(clock.Now(ctx)), 296 Kind: &run.LogEntry_SubmissionFailure_{ 297 SubmissionFailure: &run.LogEntry_SubmissionFailure{ 298 Event: sc, 299 }, 300 }, 301 }) 302 } 303 allRunCLs, err := run.LoadRunCLs(ctx, rs.ID, common.MakeCLIDs(rs.Submission.GetCls()...)) 304 if err != nil { 305 return nil, err 306 } 307 impl.scheduleResetTriggersForNotSubmittedCLs(ctx, rs, allRunCLs, sc) 308 return &Result{State: rs}, nil 309 default: 310 panic(fmt.Errorf("impossible submission result %s", sc.GetResult())) 311 } 312 } 313 314 // TryResumeSubmission implements Handler interface. 315 func (impl *Impl) TryResumeSubmission(ctx context.Context, rs *state.RunState) (*Result, error) { 316 return impl.tryResumeSubmission(ctx, rs, nil) 317 } 318 319 func (impl *Impl) tryResumeSubmission(ctx context.Context, rs *state.RunState, sc *eventpb.SubmissionCompleted) (*Result, error) { 320 switch { 321 case rs.Status != run.Status_SUBMITTING || rs.SubmissionScheduled: 322 return &Result{State: rs}, nil 323 case sc != nil && sc.Result != eventpb.SubmissionResult_FAILED_TRANSIENT: 324 panic(fmt.Errorf("submission can only be resumed on nil submission completed event or event reporting transient failure; got %s", sc)) 325 } 326 327 deadline := rs.Submission.GetDeadline() 328 taskID := rs.Submission.GetTaskId() 329 switch { 330 case deadline == nil: 331 panic(fmt.Errorf("impossible: run %q is submitting but Run.Submission.Deadline is not set", rs.ID)) 332 case taskID == "": 333 panic(fmt.Errorf("impossible: run %q is submitting but Run.Submission.TaskId is not set", rs.ID)) 334 } 335 336 switch expired := clock.Now(ctx).After(deadline.AsTime()); { 337 case expired: 338 cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID) 339 if err != nil { 340 return nil, errors.Annotate(err, "prjcfg.GetConfigGroup").Err() 341 } 342 343 rs = rs.ShallowCopy() 344 switch submittedCnt := len(rs.Submission.GetSubmittedCls()); { 345 case submittedCnt > 0 && submittedCnt == len(rs.Submission.GetCls()): 346 // Fully submitted 347 if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil { 348 return nil, err 349 } 350 childRuns, err := run.LoadChildRuns(ctx, rs.ID) 351 if err != nil { 352 return nil, errors.Annotate(err, "failed to load child runs").Err() 353 } 354 return &Result{ 355 State: rs, 356 SideEffectFn: impl.endRun(ctx, rs, run.Status_SUCCEEDED, cg, childRuns), 357 }, nil 358 default: // None submitted or partially submitted 359 // Synthesize a submission completed event with permanent failure. 360 if clFailures := sc.GetClFailures(); clFailures != nil { 361 rs.Submission = proto.Clone(rs.Submission).(*run.Submission) 362 rs.Submission.FailedCls = make([]int64, len(clFailures.GetFailures())) 363 sc = &eventpb.SubmissionCompleted{ 364 Result: eventpb.SubmissionResult_FAILED_PERMANENT, 365 FailureReason: &eventpb.SubmissionCompleted_ClFailures{ 366 ClFailures: &eventpb.SubmissionCompleted_CLSubmissionFailures{ 367 Failures: make([]*eventpb.SubmissionCompleted_CLSubmissionFailure, len(clFailures.GetFailures())), 368 }, 369 }, 370 } 371 for i, f := range clFailures.GetFailures() { 372 rs.Submission.FailedCls[i] = f.GetClid() 373 sc.GetClFailures().Failures[i] = &eventpb.SubmissionCompleted_CLSubmissionFailure{ 374 Clid: f.GetClid(), 375 Message: fmt.Sprintf("CL failed to submit because of transient failure: %s. However, submission is running out of time to retry.", f.GetMessage()), 376 } 377 } 378 } else { 379 sc = &eventpb.SubmissionCompleted{ 380 Result: eventpb.SubmissionResult_FAILED_PERMANENT, 381 FailureReason: &eventpb.SubmissionCompleted_Timeout{ 382 Timeout: true, 383 }, 384 } 385 } 386 allRunCLs, err := run.LoadRunCLs(ctx, rs.ID, common.MakeCLIDs(rs.Submission.GetCls()...)) 387 if err != nil { 388 return nil, err 389 } 390 impl.scheduleResetTriggersForNotSubmittedCLs(ctx, rs, allRunCLs, sc) 391 if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil { 392 return nil, err 393 } 394 return &Result{State: rs}, nil 395 } 396 case taskID == mustTaskIDFromContext(ctx): 397 // Matching taskID indicates current task is the retry of a previous 398 // submitting task that has failed transiently. Continue the submission. 399 rs = rs.ShallowCopy() 400 s := submit.NewSubmitter(ctx, rs.ID, rs.Submission, impl.RM, impl.GFactory) 401 rs.SubmissionScheduled = true 402 return &Result{ 403 State: rs, 404 PostProcessFn: s.Submit, 405 }, nil 406 default: 407 // Presumably another task is working on the submission at this time. 408 // So, wake up RM as soon as the submission expires. Meanwhile, don't 409 // consume the event as the retries of submission task will process 410 // this event. It's probably a race condition that this task sees this 411 // event first. 412 if err := impl.RM.Invoke(ctx, rs.ID, deadline.AsTime()); err != nil { 413 return nil, err 414 } 415 return &Result{ 416 State: rs, 417 PreserveEvents: true, 418 }, nil 419 } 420 } 421 422 func acquireSubmitQueue(ctx context.Context, rs *state.RunState, rm RM, opts *cfgpb.SubmitOptions) (waitlisted bool, err error) { 423 now := clock.Now(ctx).UTC() 424 var innerErr error 425 err = datastore.RunInTransaction(ctx, func(ctx context.Context) error { 426 waitlisted, innerErr = submit.TryAcquire(ctx, rm.NotifyReadyForSubmission, rs.ID, opts) 427 switch { 428 case innerErr != nil: 429 return innerErr 430 case !waitlisted: 431 // It is possible that RM fails before successfully completing the state 432 // transition. In that case, this Run will block Submit Queue infinitely. 433 // Sending a ReadyForSubmission event after 10s as a fail-safe to ensure 434 // Run keeps making progress. 435 return rm.NotifyReadyForSubmission(ctx, rs.ID, now.Add(10*time.Second)) 436 default: 437 return nil 438 } 439 }, nil) 440 switch { 441 case innerErr != nil: 442 return false, innerErr 443 case err != nil: 444 return false, errors.Annotate(err, "failed to run the transaction to acquire submit queue").Tag(transient.Tag).Err() 445 case waitlisted: 446 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 447 Time: timestamppb.New(clock.Now(ctx)), 448 Kind: &run.LogEntry_Waitlisted_{ 449 Waitlisted: &run.LogEntry_Waitlisted{}, 450 }, 451 }) 452 logging.Debugf(ctx, "Waitlisted in Submit Queue") 453 return true, nil 454 default: 455 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 456 Time: timestamppb.New(clock.Now(ctx)), 457 Kind: &run.LogEntry_AcquiredSubmitQueue_{ 458 AcquiredSubmitQueue: &run.LogEntry_AcquiredSubmitQueue{}, 459 }, 460 }) 461 logging.Debugf(ctx, "Acquired Submit Queue") 462 return false, nil 463 } 464 } 465 466 // releaseSubmitQueueIfTaken checks if submit queue is occupied by the given 467 // Run before trying to release. 468 func releaseSubmitQueueIfTaken(ctx context.Context, rs *state.RunState, rm RM) error { 469 switch current, waitlist, err := submit.LoadCurrentAndWaitlist(ctx, rs.ID); { 470 case err != nil: 471 return err 472 case current == rs.ID: 473 return releaseSubmitQueue(ctx, rs, rm) 474 default: 475 for _, w := range waitlist { 476 if w == rs.ID { 477 return releaseSubmitQueue(ctx, rs, rm) 478 } 479 } 480 } 481 return nil 482 } 483 484 func releaseSubmitQueue(ctx context.Context, rs *state.RunState, rm RM) error { 485 var innerErr error 486 err := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 487 innerErr = submit.Release(ctx, rm.NotifyReadyForSubmission, rs.ID) 488 return innerErr 489 }, nil) 490 switch { 491 case innerErr != nil: 492 return innerErr 493 case err != nil: 494 return errors.Annotate(err, "failed to release submit queue").Tag(transient.Tag).Err() 495 } 496 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 497 Time: timestamppb.New(clock.Now(ctx)), 498 Kind: &run.LogEntry_ReleasedSubmitQueue_{ 499 ReleasedSubmitQueue: &run.LogEntry_ReleasedSubmitQueue{}, 500 }, 501 }) 502 logging.Debugf(ctx, "Released Submit Queue") 503 return nil 504 } 505 506 const defaultSubmissionDuration = 20 * time.Minute 507 508 func markSubmitting(ctx context.Context, rs *state.RunState) error { 509 rs.Status = run.Status_SUBMITTING 510 cls, err := run.LoadRunCLs(ctx, rs.ID, rs.CLs) 511 if err != nil { 512 return err 513 } 514 if rs.Submission.Cls, err = orderCLsInSubmissionOrder(ctx, cls, rs.ID, rs.Submission); err != nil { 515 return err 516 } 517 // TODO(b/267966142): Find a less hacky way to set the timeout for CLs 518 // that are known to slow to land. 519 submissionDuration := defaultSubmissionDuration 520 if hasCrOSKernelChange(rs.ID.LUCIProject(), cls) { 521 submissionDuration = 40 * time.Minute 522 } 523 rs.Submission.Deadline = timestamppb.New(clock.Now(ctx).UTC().Add(submissionDuration)) 524 rs.Submission.TaskId = mustTaskIDFromContext(ctx) 525 return nil 526 } 527 528 func (impl *Impl) scheduleResetTriggersForNotSubmittedCLs(ctx context.Context, rs *state.RunState, allRunCLs []*run.RunCL, sc *eventpb.SubmissionCompleted) { 529 whoms := rs.Mode.GerritNotifyTargets() 530 // Single-CL Run 531 if len(allRunCLs) == 1 { 532 meta := reviewInputMeta{ 533 notify: whoms, 534 addToAttention: whoms, 535 reason: submissionFailureAttentionReason, 536 } 537 switch { 538 case sc.GetClFailures() != nil: 539 failures := sc.GetClFailures().GetFailures() 540 if len(failures) != 1 { 541 panic(fmt.Errorf("expected exactly 1 failed CL, got %v", failures)) 542 } 543 meta.message = failures[0].GetMessage() 544 case sc.GetTimeout(): 545 meta.message = timeoutMsg 546 default: 547 meta.message = defaultMsg 548 } 549 scheduleTriggersReset(ctx, rs, map[common.CLID]reviewInputMeta{ 550 allRunCLs[0].ID: meta, 551 }, run.Status_FAILED) 552 return 553 } 554 555 // Multi-CL Run 556 submitted, failed, pending := splitRunCLs(allRunCLs, rs.Submission, sc) 557 msgSuffix := makeSubmissionMsgSuffix(submitted, failed, pending) 558 // Multi-CL Run with root CL (i.e. instantly triggered multi-CL Run) 559 if rs.HasRootCL() { 560 meta := reviewInputMeta{ 561 notify: whoms, 562 addToAttention: whoms, 563 reason: submissionFailureAttentionReason, 564 } 565 switch clFailures := sc.GetClFailures().GetFailures(); { 566 case len(clFailures) == 1 && clFailures[0].GetClid() == int64(rs.RootCL): 567 // this is the case where the only failed CL is the root CL. 568 meta.message = fmt.Sprintf("%s\n\n%s", sc.GetClFailures().GetFailures()[0].GetMessage(), msgSuffix) 569 case len(clFailures) > 0: 570 // Summarize a list of the failed CLs and post it onto the root CL. 571 var sb strings.Builder 572 sb.WriteString(partiallySubmittedMsgForRootCL) 573 failureMsgByCLID := map[common.CLID]string{} 574 for _, f := range sc.GetClFailures().GetFailures() { 575 failureMsgByCLID[common.CLID(f.GetClid())] = f.GetMessage() 576 } 577 for _, f := range failed { 578 fmt.Fprintf(&sb, "\n* %s: %s", f.ExternalID.MustURL(), failureMsgByCLID[f.ID]) 579 } 580 fmt.Fprint(&sb, "\n\n") 581 fmt.Fprint(&sb, msgSuffix) 582 meta.message = sb.String() 583 case sc.GetTimeout(): 584 meta.message = fmt.Sprintf("%s\n\n%s", timeoutMsg, msgSuffix) 585 default: 586 meta.message = fmt.Sprintf("%s\n\n%s", defaultMsg, msgSuffix) 587 } 588 scheduleTriggersReset(ctx, rs, map[common.CLID]reviewInputMeta{ 589 rs.RootCL: meta, 590 }, run.Status_FAILED) 591 return 592 } 593 // Multi-CL Run without root CL 594 metas := make(map[common.CLID]reviewInputMeta, len(failed)+len(pending)) 595 switch { 596 case sc.GetClFailures() != nil: 597 // Compute metas for CLs that fail to submit 598 for _, f := range sc.GetClFailures().GetFailures() { 599 metas[common.CLID(f.GetClid())] = reviewInputMeta{ 600 notify: whoms, 601 message: fmt.Sprintf("%s\n\n%s", f.GetMessage(), msgSuffix), 602 addToAttention: whoms, 603 reason: submissionFailureAttentionReason, 604 } 605 } 606 607 // Compute the metas for CLs that CV didn't attempt to submit. 608 var sb strings.Builder 609 // TODO(yiwzhang): Once CV learns how to submit multiple CLs in parallel, 610 // this should be optimized to print out failed CLs that each pending CL 611 // depends on instead of printing out all failed CLs. 612 // Example: considering a CL group where CL B and CL C are submitted in 613 // parallel and neither of them succeeds: 614 // A (submitted) 615 // | 616 // |--> B (failed) --> D (pending) 617 // | 618 // |--> C (failed) --> E (pending) 619 // the message CV posts on CL D should only include the fact that CV fails 620 // to submit CL B. 621 for _, f := range failed { 622 fmt.Fprintf(&sb, "\n* %s", f.ExternalID.MustURL()) 623 } 624 fmt.Fprint(&sb, "\n\n") 625 fmt.Fprint(&sb, msgSuffix) 626 pendingMeta := reviewInputMeta{ 627 notify: whoms, 628 message: fmt.Sprintf("%s%s", partiallySubmittedMsgForPendingCLs, sb.String()), 629 addToAttention: whoms, 630 reason: submissionFailureAttentionReason, 631 } 632 for _, pendingCL := range pending { 633 metas[pendingCL.ID] = pendingMeta 634 } 635 scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED) 636 637 // Post messages to submitted CLs. 638 // TODO(yiwzhang): model message posting as a long op. 639 if len(submitted) > 0 { 640 msg := fmt.Sprintf("%s%s", partiallySubmittedMsgForSubmittedCLs, sb.String()) 641 err := parallel.FanOutIn(func(workCh chan<- func() error) { 642 for _, submittedCL := range submitted { 643 submittedCL := submittedCL 644 workCh <- func() error { 645 // Swallow the error because these messages are not critical and 646 // it's okay to not posting them during a Gerrit hiccup. 647 if err := postMsgForDependentFailures(ctx, impl.GFactory, submittedCL, msg); err != nil { 648 logging.Warningf(ctx, "failed to post messages for dependent submission failure on submitted changes: %s", err) 649 } 650 return nil 651 } 652 } 653 }) 654 if err != nil { 655 panic(fmt.Errorf("impossible parallel error: %w", err)) 656 } 657 } 658 case sc.GetTimeout(): 659 meta := reviewInputMeta{ 660 notify: whoms, 661 message: fmt.Sprintf("%s\n\n%s", timeoutMsg, msgSuffix), 662 addToAttention: whoms, 663 reason: submissionFailureAttentionReason, 664 } 665 for _, cl := range pending { 666 metas[cl.ID] = meta 667 } 668 default: 669 meta := reviewInputMeta{ 670 notify: whoms, 671 message: fmt.Sprintf("%s\n\n%s", defaultMsg, msgSuffix), 672 addToAttention: whoms, 673 reason: submissionFailureAttentionReason, 674 } 675 for _, cl := range pending { 676 metas[cl.ID] = meta 677 } 678 } 679 scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED) 680 } 681 682 func makeSubmissionMsgSuffix(submitted, failed, pending []*run.RunCL) string { 683 submittedURLs := make([]string, len(submitted)) 684 for i, cl := range submitted { 685 submittedURLs[i] = cl.ExternalID.MustURL() 686 } 687 notSubmittedURLs := make([]string, len(failed)+len(pending)) 688 for i, cl := range failed { 689 notSubmittedURLs[i] = cl.ExternalID.MustURL() 690 } 691 for i, cl := range pending { 692 notSubmittedURLs[len(failed)+i] = cl.ExternalID.MustURL() 693 } 694 if len(submittedURLs) > 0 { // partial submission 695 return fmt.Sprintf(partiallySubmittedMsgSuffixFmt, 696 strings.Join(notSubmittedURLs, "\n* "), 697 strings.Join(submittedURLs, "\n* "), 698 ) 699 } 700 return fmt.Sprintf(noneSubmittedMsgSuffixFmt, strings.Join(notSubmittedURLs, "\n* ")) 701 } 702 703 //////////////////////////////////////////////////////////////////////////////// 704 // Helper methods 705 706 var fakeTaskIDKey = "used in handler tests only for setting the mock taskID" 707 708 func mustTaskIDFromContext(ctx context.Context) string { 709 if taskID, ok := ctx.Value(&fakeTaskIDKey).(string); ok { 710 return taskID 711 } 712 switch executionInfo := tq.TaskExecutionInfo(ctx); { 713 case executionInfo == nil: 714 panic("must be called within a task handler") 715 case executionInfo.TaskID == "": 716 panic("taskID in task executionInfo is empty") 717 default: 718 return executionInfo.TaskID 719 } 720 } 721 func orderCLsInSubmissionOrder(ctx context.Context, cls []*run.RunCL, runID common.RunID, sub *run.Submission) ([]int64, error) { 722 cls, err := submit.ComputeOrder(cls) 723 if err != nil { 724 return nil, err 725 } 726 ret := make([]int64, len(cls)) 727 for i, cl := range cls { 728 ret[i] = int64(cl.ID) 729 } 730 return ret, nil 731 } 732 733 func hasCrOSKernelChange(luciProject string, cls []*run.RunCL) bool { 734 if luciProject != "chromeos" { 735 return false 736 } 737 for _, cl := range cls { 738 if cl.Detail.GetGerrit().GetInfo().GetProject() == "chromiumos/third_party/kernel" { 739 return true 740 } 741 } 742 return false 743 } 744 func splitRunCLs(cls []*run.RunCL, submission *run.Submission, sc *eventpb.SubmissionCompleted) (submitted, failed, pending []*run.RunCL) { 745 submittedSet := common.MakeCLIDsSet(submission.GetSubmittedCls()...) 746 failedSet := make(common.CLIDsSet, len(sc.GetClFailures().GetFailures())) 747 for _, f := range sc.GetClFailures().GetFailures() { 748 if submittedSet.HasI64(f.GetClid()) { 749 panic(fmt.Errorf("impossible; cl %d is marked both submitted and failed", f.GetClid())) 750 } 751 failedSet.AddI64(f.GetClid()) 752 } 753 754 submitted = make([]*run.RunCL, 0, len(submittedSet)) 755 failed = make([]*run.RunCL, 0, len(failedSet)) 756 pending = make([]*run.RunCL, 0, len(cls)-len(submittedSet)-len(failedSet)) 757 for _, cl := range cls { 758 switch { 759 case submittedSet.Has(cl.ID): 760 submitted = append(submitted, cl) 761 case failedSet.Has(cl.ID): 762 failed = append(failed, cl) 763 default: 764 pending = append(pending, cl) 765 } 766 } 767 return submitted, failed, pending 768 } 769 770 // TODO(crbug/1302119): Replace terms like "Project admin" with dedicated 771 // contact sourced from Project Config. 772 const ( 773 cvBugLink = "https://issues.chromium.org/issues/new?component=1456237" 774 defaultMsg = "Submission of this CL failed due to unexpected internal " + 775 "error. Please contact LUCI team.\n\n" + cvBugLink 776 noneSubmittedMsgSuffixFmt = "None of the CLs in the Run has been " + 777 "submitted. CLs:\n* %s" 778 partiallySubmittedMsgForRootCL = "Failed to submit the following CL(s):" 779 partiallySubmittedMsgForPendingCLs = "This CL is not submitted because " + 780 "submission has failed for the following CL(s) which this CL depends on." 781 partiallySubmittedMsgForSubmittedCLs = "This CL is submitted. However, " + 782 "submission has failed for the following CL(s) which depend on this CL." 783 partiallySubmittedMsgSuffixFmt = "CLs in the Run have been submitted " + 784 "partially.\nNot submitted:\n* %s\nSubmitted:\n* %s\n" + 785 "Please, use your judgement to determine if already submitted CLs have " + 786 "to be reverted, or if the remaining CLs could be manually submitted. " + 787 "If you think the partially submitted CLs may have broken the " + 788 "tip-of-tree of your project, consider notifying your infrastructure " + 789 "team/gardeners/sheriffs." 790 timeoutMsg = "Ran out of time to submit this CL. " + 791 // TODO(yiwzhang): Generally, time out means CV is doing something 792 // wrong and looping over internally, However, timeout could also 793 // happen when submitting large CL stack and Gerrit is slow. In that 794 // case, CV can't do anything about it. After launching m1, gather data 795 // to see under what circumstance it may happen and revise this message 796 // so that CV doesn't get blamed for timeout it isn't responsible for. 797 "Please contact LUCI team.\n\n" + cvBugLink 798 persistentTreeStatusAppFailureTemplate = "Could not submit this CL " + 799 "because the tree status app at %s repeatedly returned failures. " 800 treeStatusCheckFailedReason = "Tree status check failed." 801 submissionFailureAttentionReason = "Submission failed." 802 ) 803 804 // postMsgForDependentFailures posts a review message to 805 // a given CL to notify submission failures of the dependent CLs. 806 func postMsgForDependentFailures(ctx context.Context, gf gerrit.Factory, rcl *run.RunCL, msg string) error { 807 queryOpts := []gerritpb.QueryOption{gerritpb.QueryOption_MESSAGES} 808 posted, err := util.IsActionTakenOnGerritCL(ctx, gf, rcl, queryOpts, func(rcl *run.RunCL, ci *gerritpb.ChangeInfo) time.Time { 809 // In practice, Gerrit generally orders the messages from earliest to 810 // latest. So iterating in the reverse order because it's more likely the 811 // desired message is posted recently. Also don't visit any messages before 812 // the run trigger as those messages should belong to previous Runs. 813 clTriggeredAt := rcl.Trigger.Time.AsTime() 814 for i := len(ci.GetMessages()) - 1; i >= 0; i-- { 815 m := ci.GetMessages()[i] 816 switch t := m.GetDate().AsTime(); { 817 case t.Before(clTriggeredAt): 818 // i-th message is too old, no need to check even older ones. 819 return time.Time{} 820 case strings.Contains(m.GetMessage(), msg): 821 return t 822 } 823 } 824 return time.Time{} 825 }) 826 827 switch { 828 case err != nil: 829 return err 830 case !posted.IsZero(): 831 return nil 832 } 833 834 ci := rcl.Detail.GetGerrit().GetInfo() 835 ownerAndVotersAccounts := gerrit.Whoms{gerrit.Whom_OWNER, gerrit.Whom_CQ_VOTERS}.ToAccountIDsSorted(ci) 836 req := &gerritpb.SetReviewRequest{ 837 Number: ci.GetNumber(), 838 Project: ci.GetProject(), 839 RevisionId: ci.GetCurrentRevision(), 840 Message: msg, 841 Tag: run.FullRun.GerritMessageTag(), 842 Notify: gerritpb.Notify_NOTIFY_NONE, 843 NotifyDetails: &gerritpb.NotifyDetails{ 844 Recipients: []*gerritpb.NotifyDetails_Recipient{ 845 { 846 RecipientType: gerritpb.NotifyDetails_RECIPIENT_TYPE_TO, 847 Info: &gerritpb.NotifyDetails_Info{ 848 Accounts: ownerAndVotersAccounts, 849 }, 850 }, 851 }, 852 }, 853 AddToAttentionSet: make([]*gerritpb.AttentionSetInput, len(ownerAndVotersAccounts)), 854 } 855 reason := fmt.Sprintf("ps#%d: failed to submit dependent CLs", 856 ci.GetRevisions()[ci.GetCurrentRevision()].GetNumber()) 857 for i, acct := range ownerAndVotersAccounts { 858 req.AddToAttentionSet[i] = &gerritpb.AttentionSetInput{ 859 User: strconv.Itoa(int(acct)), 860 Reason: reason, 861 } 862 } 863 return util.MutateGerritCL(ctx, gf, rcl, req, 1*time.Minute, "post-msg-for-dependent-failure") 864 }